1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation of the loop memory dependence analysis that was
10// originally developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallSet.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
45#include "llvm/IR/InstrTypes.h"
46#include "llvm/IR/Instruction.h"
48#include "llvm/IR/Operator.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Type.h"
52#include "llvm/IR/Value.h"
53#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Debug.h"
59#include <algorithm>
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64#include <variant>
65#include <vector>
66
67using namespace llvm;
68using namespace llvm::PatternMatch;
69
70#define DEBUG_TYPE "loop-accesses"
71
73VectorizationFactor("force-vector-width", cl::Hidden,
74 cl::desc("Sets the SIMD width. Zero is autoselect."),
77
79VectorizationInterleave("force-vector-interleave", cl::Hidden,
80 cl::desc("Sets the vectorization interleave count. "
81 "Zero is autoselect."),
85
87 "runtime-memory-check-threshold", cl::Hidden,
88 cl::desc("When performing memory disambiguation checks at runtime do not "
89 "generate more than this number of comparisons (default = 8)."),
92
93/// The maximum number of iterations used to merge memory checks.
95 "memory-check-merge-threshold", cl::Hidden,
96 cl::desc("Maximum number of comparisons done when trying to merge "
97 "runtime memory checks. (default = 100)"),
98 cl::init(100));
99
100/// Maximum SIMD width.
101const unsigned VectorizerParams::MaxVectorWidth = 64;
102
103/// We collect dependences up to this threshold.
105 MaxDependences("max-dependences", cl::Hidden,
106 cl::desc("Maximum number of dependences collected by "
107 "loop-access analysis (default = 100)"),
108 cl::init(100));
109
110/// This enables versioning on the strides of symbolically striding memory
111/// accesses in code like the following.
112/// for (i = 0; i < N; ++i)
113/// A[i * Stride1] += B[i * Stride2] ...
114///
115/// Will be roughly translated to
116/// if (Stride1 == 1 && Stride2 == 1) {
117/// for (i = 0; i < N; i+=4)
118/// A[i:i+3] += ...
119/// } else
120/// ...
122 "enable-mem-access-versioning", cl::init(true), cl::Hidden,
123 cl::desc("Enable symbolic stride memory access versioning"));
124
125/// Enable store-to-load forwarding conflict detection. This option can
126/// be disabled for correctness testing.
128 "store-to-load-forwarding-conflict-detection", cl::Hidden,
129 cl::desc("Enable conflict detection in loop-access analysis"),
130 cl::init(true));
131
133 "max-forked-scev-depth", cl::Hidden,
134 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
135 cl::init(5));
136
138 "laa-speculate-unit-stride", cl::Hidden,
139 cl::desc("Speculate that non-constant strides are unit in LAA"),
140 cl::init(true));
141
143 "hoist-runtime-checks", cl::Hidden,
144 cl::desc(
145 "Hoist inner loop runtime memory checks to outer loop if possible"),
148
150 return ::VectorizationInterleave.getNumOccurrences() > 0;
151}
152
154 const DenseMap<Value *, const SCEV *> &PtrToStride,
155 Value *Ptr) {
156 const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
157
158 // If there is an entry in the map return the SCEV of the pointer with the
159 // symbolic stride replaced by one.
161 if (SI == PtrToStride.end())
162 // For a non-symbolic stride, just return the original expression.
163 return OrigSCEV;
164
165 const SCEV *StrideSCEV = SI->second;
166 // Note: This assert is both overly strong and overly weak. The actual
167 // invariant here is that StrideSCEV should be loop invariant. The only
168 // such invariant strides we happen to speculate right now are unknowns
169 // and thus this is a reasonable proxy of the actual invariant.
170 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
171
172 ScalarEvolution *SE = PSE.getSE();
173 const auto *CT = SE->getOne(StrideSCEV->getType());
174 PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
175 auto *Expr = PSE.getSCEV(Ptr);
176
177 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
178 << " by: " << *Expr << "\n");
179 return Expr;
180}
181
183 unsigned Index, RuntimePointerChecking &RtCheck)
184 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
185 AddressSpace(RtCheck.Pointers[Index]
186 .PointerValue->getType()
188 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
190}
191
192/// Calculate the Start and End points of a memory access.
193/// Let's assume A is the first access and B is a memory access on the N-th
194/// loop iteration. Then B is calculated as:
195/// B = A + Step*N.
196/// Step value may be positive or negative.
197/// N is the calculated back-edge taken count:
198/// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
199/// Start and End points are calculated in the following way:
200/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
201/// where SizeOfElt is the size of single memory access in bytes.
202///
203/// There is no conflict when the intervals are disjoint:
204/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
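/// For illustration (editorial example, not part of the original comment):
/// for an i32 access A[i] with back-edge taken count BTC, the pointer AddRec
/// is {A,+,4}, so Start = A and End = A + 4*BTC + 4, i.e. one element past
/// the last address read or written. Two accesses only need a runtime check
/// when their [Start, End) intervals overlap.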
206 Type *AccessTy, bool WritePtr,
207 unsigned DepSetId, unsigned ASId,
209 bool NeedsFreeze) {
210 ScalarEvolution *SE = PSE.getSE();
211
212 const SCEV *ScStart;
213 const SCEV *ScEnd;
214
215 if (SE->isLoopInvariant(PtrExpr, Lp)) {
216 ScStart = ScEnd = PtrExpr;
217 } else {
218 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
219 assert(AR && "Invalid addrec expression");
220 const SCEV *Ex = PSE.getBackedgeTakenCount();
221
222 ScStart = AR->getStart();
223 ScEnd = AR->evaluateAtIteration(Ex, *SE);
224 const SCEV *Step = AR->getStepRecurrence(*SE);
225
226 // For expressions with negative step, the upper bound is ScStart and the
227 // lower bound is ScEnd.
228 if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
229 if (CStep->getValue()->isNegative())
230 std::swap(ScStart, ScEnd);
231 } else {
232 // Fallback case: the step is not constant, but we can still
233 // get the upper and lower bounds of the interval by using min/max
234 // expressions.
235 ScStart = SE->getUMinExpr(ScStart, ScEnd);
236 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
237 }
238 }
239 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
240 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
241
242 // Add the size of the pointed element to ScEnd.
243 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
244 Type *IdxTy = DL.getIndexType(Ptr->getType());
245 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
246 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
247
248 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
249 NeedsFreeze);
250}
251
252void RuntimePointerChecking::tryToCreateDiffCheck(
253 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
254 if (!CanUseDiffCheck)
255 return;
256
257 // If either group contains multiple different pointers, bail out.
258 // TODO: Support multiple pointers by using the minimum or maximum pointer,
259 // depending on src & sink.
260 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
261 CanUseDiffCheck = false;
262 return;
263 }
264
265 PointerInfo *Src = &Pointers[CGI.Members[0]];
266 PointerInfo *Sink = &Pointers[CGJ.Members[0]];
267
268 // If either pointer is read and written, multiple checks may be needed. Bail
269 // out.
270 if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
271 !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
272 CanUseDiffCheck = false;
273 return;
274 }
275
276 ArrayRef<unsigned> AccSrc =
277 DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
278 ArrayRef<unsigned> AccSink =
279 DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
280 // If either pointer is accessed multiple times, there may not be a clear
281 // src/sink relation. Bail out for now.
282 if (AccSrc.size() != 1 || AccSink.size() != 1) {
283 CanUseDiffCheck = false;
284 return;
285 }
286 // If the sink is accessed before src, swap src/sink.
287 if (AccSink[0] < AccSrc[0])
288 std::swap(Src, Sink);
289
290 auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
291 auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
292 if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
293 SinkAR->getLoop() != DC.getInnermostLoop()) {
294 CanUseDiffCheck = false;
295 return;
296 }
297
299 DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
301 DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
302 Type *SrcTy = getLoadStoreType(SrcInsts[0]);
303 Type *DstTy = getLoadStoreType(SinkInsts[0]);
304 if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
305 CanUseDiffCheck = false;
306 return;
307 }
308 const DataLayout &DL =
309 SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
310 unsigned AllocSize =
311 std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
312
313 // Only constant steps matching the AllocSize are supported at the
314 // moment. This simplifies the difference computation. Can be extended in the
315 // future.
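 // Editorial illustration (assumed example, not from the original source):
 // for i32 accesses A[i] (src) and A[i + 4] (sink) in the same loop, both
 // AddRecs have a constant step of 4 bytes equal to AllocSize, so a single
 // runtime comparison of the distance between the two start addresses
 // (materialized later when the checks are expanded) is sufficient.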
316 auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
317 if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
318 Step->getAPInt().abs() != AllocSize) {
319 CanUseDiffCheck = false;
320 return;
321 }
322
323 IntegerType *IntTy =
324 IntegerType::get(Src->PointerValue->getContext(),
325 DL.getPointerSizeInBits(CGI.AddressSpace));
326
327 // When counting down, the dependence distance needs to be swapped.
328 if (Step->getValue()->isNegative())
329 std::swap(SinkAR, SrcAR);
330
331 const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
332 const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
333 if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
334 isa<SCEVCouldNotCompute>(SrcStartInt)) {
335 CanUseDiffCheck = false;
336 return;
337 }
338
339 const Loop *InnerLoop = SrcAR->getLoop();
340 // If the start values for both Src and Sink also vary according to an outer
341 // loop, then it's probably better to avoid creating diff checks because
342 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
343 // do the expanded full range overlap checks, which can be hoisted.
344 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
345 isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
346 auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
347 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
348 const Loop *StartARLoop = SrcStartAR->getLoop();
349 if (StartARLoop == SinkStartAR->getLoop() &&
350 StartARLoop == InnerLoop->getParentLoop() &&
351 // If the diff check would already be loop invariant (due to the
352 // recurrences being the same), then we prefer to keep the diff checks
353 // because they are cheaper.
354 SrcStartAR->getStepRecurrence(*SE) !=
355 SinkStartAR->getStepRecurrence(*SE)) {
356 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
357 "cannot be hoisted out of the outer loop\n");
358 CanUseDiffCheck = false;
359 return;
360 }
361 }
362
363 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
364 << "SrcStart: " << *SrcStartInt << '\n'
365 << "SinkStartInt: " << *SinkStartInt << '\n');
366 DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
367 Src->NeedsFreeze || Sink->NeedsFreeze);
368}
369
370SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
372
373 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
374 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
377
378 if (needsChecking(CGI, CGJ)) {
379 tryToCreateDiffCheck(CGI, CGJ);
380 Checks.push_back(std::make_pair(&CGI, &CGJ));
381 }
382 }
383 }
384 return Checks;
385}
386
387void RuntimePointerChecking::generateChecks(
388 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
389 assert(Checks.empty() && "Checks is not empty");
390 groupChecks(DepCands, UseDependencies);
391 Checks = generateChecks();
392}
393
395 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
396 for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
397 for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
398 if (needsChecking(M.Members[I], N.Members[J]))
399 return true;
400 return false;
401}
402
403/// Compare \p I and \p J and return the minimum.
404/// Return nullptr in case we couldn't find an answer.
405static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
406 ScalarEvolution *SE) {
407 const SCEV *Diff = SE->getMinusSCEV(J, I);
408 const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
409
410 if (!C)
411 return nullptr;
412 if (C->getValue()->isNegative())
413 return J;
414 return I;
415}
416
418 RuntimePointerChecking &RtCheck) {
419 return addPointer(
420 Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
421 RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
422 RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
423}
424
426 const SCEV *End, unsigned AS,
427 bool NeedsFreeze,
428 ScalarEvolution &SE) {
429 assert(AddressSpace == AS &&
430 "all pointers in a checking group must be in the same address space");
431
432 // Compare the starts and ends with the known minimum and maximum
433 // of this set. We need to know how we compare against the min/max
434 // of the set in order to be able to emit memchecks.
435 const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
436 if (!Min0)
437 return false;
438
439 const SCEV *Min1 = getMinFromExprs(End, High, &SE);
440 if (!Min1)
441 return false;
442
443 // Update the low bound expression if we've found a new min value.
444 if (Min0 == Start)
445 Low = Start;
446
447 // Update the high bound expression if we've found a new max value.
448 if (Min1 != End)
449 High = End;
450
452 this->NeedsFreeze |= NeedsFreeze;
453 return true;
454}
455
456void RuntimePointerChecking::groupChecks(
457 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
458 // We build the groups from dependency candidates equivalence classes
459 // because:
460 // - We know that pointers in the same equivalence class share
461 // the same underlying object and therefore there is a chance
462 // that we can compare pointers
463 // - We wouldn't be able to merge two pointers for which we need
464 // to emit a memcheck. The classes in DepCands are already
465 // conveniently built such that no two pointers in the same
466 // class need checking against each other.
467
468 // We use the following (greedy) algorithm to construct the groups
469 // For every pointer in the equivalence class:
470 // For each existing group:
471 // - if the difference between this pointer and the min/max bounds
472 // of the group is a constant, then make the pointer part of the
473 // group and update the min/max bounds of that group as required.
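 // Editorial illustration (assumed example, not from the original source):
 // if one equivalence class contains &A[0], &A[4] and &A[100], all pairwise
 // differences are constant, so the three pointers end up in a single group
 // whose bounds are Low = &A[0] and High = one element past &A[100].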
474
475 CheckingGroups.clear();
476
477 // If we need to check two pointers to the same underlying object
478 // with a non-constant difference, we shouldn't perform any pointer
479 // grouping with those pointers. This is because we can easily get
480 // into cases where the resulting check would return false, even when
481 // the accesses are safe.
482 //
483 // The following example shows this:
484 // for (i = 0; i < 1000; ++i)
485 // a[5000 + i * m] = a[i] + a[i + 9000]
486 //
487 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
488 // (0, 10000) which is always false. However, if m is 1, there is no
489 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
490 // us to perform an accurate check in this case.
491 //
492 // The above case requires that we have an UnknownDependence between
493 // accesses to the same underlying object. This cannot happen unless
494 // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
495 // is also false. In this case we will use the fallback path and create
496 // separate checking groups for all pointers.
497
498 // If we don't have the dependency partitions, construct a new
499 // checking pointer group for each pointer. This is also required
500 // for correctness, because in this case we can have checking between
501 // pointers to the same underlying object.
502 if (!UseDependencies) {
503 for (unsigned I = 0; I < Pointers.size(); ++I)
504 CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
505 return;
506 }
507
508 unsigned TotalComparisons = 0;
509
511 for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
512 auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
513 Iter.first->second.push_back(Index);
514 }
515
516 // We need to keep track of what pointers we've already seen so we
517 // don't process them twice.
519
520 // Go through all equivalence classes, get the "pointer check groups"
521 // and add them to the overall solution. We use the order in which accesses
522 // appear in 'Pointers' to enforce determinism.
523 for (unsigned I = 0; I < Pointers.size(); ++I) {
524 // We've seen this pointer before, and therefore already processed
525 // its equivalence class.
526 if (Seen.count(I))
527 continue;
528
529 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
530 Pointers[I].IsWritePtr);
531
533 auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
534
535 // Because DepCands is constructed by visiting accesses in the order in
536 // which they appear in alias sets (which is deterministic) and the
537 // iteration order within an equivalence class member is only dependent on
538 // the order in which unions and insertions are performed on the
539 // equivalence class, the iteration order is deterministic.
540 for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
541 MI != ME; ++MI) {
542 auto PointerI = PositionMap.find(MI->getPointer());
543 assert(PointerI != PositionMap.end() &&
544 "pointer in equivalence class not found in PositionMap");
545 for (unsigned Pointer : PointerI->second) {
546 bool Merged = false;
547 // Mark this pointer as seen.
548 Seen.insert(Pointer);
549
550 // Go through all the existing sets and see if we can find one
551 // which can include this pointer.
552 for (RuntimeCheckingPtrGroup &Group : Groups) {
553 // Don't perform more than a certain amount of comparisons.
554 // This should limit the cost of grouping the pointers to something
555 // reasonable. If we do end up hitting this threshold, the algorithm
556 // will create separate groups for all remaining pointers.
557 if (TotalComparisons > MemoryCheckMergeThreshold)
558 break;
559
560 TotalComparisons++;
561
562 if (Group.addPointer(Pointer, *this)) {
563 Merged = true;
564 break;
565 }
566 }
567
568 if (!Merged)
569 // We couldn't add this pointer to any existing set or the threshold
570 // for the number of comparisons has been reached. Create a new group
571 // to hold the current pointer.
572 Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
573 }
574 }
575
576 // We've computed the grouped checks for this partition.
577 // Save the results and continue with the next one.
578 llvm::copy(Groups, std::back_inserter(CheckingGroups));
579 }
580}
581
583 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
584 unsigned PtrIdx2) {
585 return (PtrToPartition[PtrIdx1] != -1 &&
586 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
587}
588
589bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
590 const PointerInfo &PointerI = Pointers[I];
591 const PointerInfo &PointerJ = Pointers[J];
592
593 // No need to check if two readonly pointers intersect.
594 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
595 return false;
596
597 // Only need to check pointers between two different dependency sets.
598 if (PointerI.DependencySetId == PointerJ.DependencySetId)
599 return false;
600
601 // Only need to check pointers in the same alias set.
602 if (PointerI.AliasSetId != PointerJ.AliasSetId)
603 return false;
604
605 return true;
606}
607
610 unsigned Depth) const {
611 unsigned N = 0;
612 for (const auto &Check : Checks) {
613 const auto &First = Check.first->Members, &Second = Check.second->Members;
614
615 OS.indent(Depth) << "Check " << N++ << ":\n";
616
617 OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
618 for (unsigned K = 0; K < First.size(); ++K)
619 OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
620
621 OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
622 for (unsigned K = 0; K < Second.size(); ++K)
623 OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
624 }
625}
626
628
629 OS.indent(Depth) << "Run-time memory checks:\n";
630 printChecks(OS, Checks, Depth);
631
632 OS.indent(Depth) << "Grouped accesses:\n";
633 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
634 const auto &CG = CheckingGroups[I];
635
636 OS.indent(Depth + 2) << "Group " << &CG << ":\n";
637 OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
638 << ")\n";
639 for (unsigned J = 0; J < CG.Members.size(); ++J) {
640 OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
641 << "\n";
642 }
643 }
644}
645
646namespace {
647
648/// Analyses memory accesses in a loop.
649///
650/// Checks whether run time pointer checks are needed and builds sets for data
651/// dependence checking.
652class AccessAnalysis {
653public:
654 /// Read or write access location.
655 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
656 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
657
658 AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
661 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
662 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
663 LoopAliasScopes(LoopAliasScopes) {
664 // We're analyzing dependences across loop iterations.
665 BAA.enableCrossIterationMode();
666 }
667
668 /// Register a load and whether it is only read from.
669 void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
670 Value *Ptr = const_cast<Value *>(Loc.Ptr);
671 AST.add(adjustLoc(Loc));
672 Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
673 if (IsReadOnly)
674 ReadOnlyPtr.insert(Ptr);
675 }
676
677 /// Register a store.
678 void addStore(MemoryLocation &Loc, Type *AccessTy) {
679 Value *Ptr = const_cast<Value *>(Loc.Ptr);
680 AST.add(adjustLoc(Loc));
681 Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
682 }
683
684 /// Check if we can emit a run-time no-alias check for \p Access.
685 ///
686 /// Returns true if we can emit a run-time no alias check for \p Access.
687 /// If we can check this access, this also adds it to a dependence set and
688 /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
689 /// we will attempt to use additional run-time checks in order to get
690 /// the bounds of the pointer.
691 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
692 MemAccessInfo Access, Type *AccessTy,
693 const DenseMap<Value *, const SCEV *> &Strides,
695 Loop *TheLoop, unsigned &RunningDepId,
696 unsigned ASId, bool ShouldCheckStride, bool Assume);
697
698 /// Check whether we can check the pointers at runtime for
699 /// non-intersection.
700 ///
701 /// Returns true if we need no check or if we do and we can generate them
702 /// (i.e. the pointers have computable bounds).
703 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
704 Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides,
705 Value *&UncomputablePtr, bool ShouldCheckWrap = false);
706
707 /// Goes over all memory accesses, checks whether a RT check is needed
708 /// and builds sets of dependent accesses.
709 void buildDependenceSets() {
710 processMemAccesses();
711 }
712
713 /// Initial processing of memory accesses determined that we need to
714 /// perform dependency checking.
715 ///
716 /// Note that this can later be cleared if we retry memcheck analysis without
717 /// dependency checking (i.e. FoundNonConstantDistanceDependence).
718 bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
719
720 /// We decided that no dependence analysis would be used. Reset the state.
721 void resetDepChecks(MemoryDepChecker &DepChecker) {
722 CheckDeps.clear();
723 DepChecker.clearDependences();
724 }
725
726 MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
727
730 return UnderlyingObjects;
731 }
732
733private:
735
736 /// Adjust the MemoryLocation so that it represents accesses to this
737 /// location across all iterations, rather than a single one.
738 MemoryLocation adjustLoc(MemoryLocation Loc) const {
739 // The accessed location varies within the loop, but remains within the
740 // underlying object.
742 Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
743 Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
744 return Loc;
745 }
746
747 /// Drop alias scopes that are only valid within a single loop iteration.
748 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
749 if (!ScopeList)
750 return nullptr;
751
752 // For the sake of simplicity, drop the whole scope list if any scope is
753 // iteration-local.
754 if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
755 return LoopAliasScopes.contains(cast<MDNode>(Scope));
756 }))
757 return nullptr;
758
759 return ScopeList;
760 }
761
762 /// Go over all memory accesses and check whether runtime pointer checks
763 /// are needed and build sets of dependency check candidates.
764 void processMemAccesses();
765
766 /// Map of all accesses. Values are the types used to access memory pointed to
767 /// by the pointer.
768 PtrAccessMap Accesses;
769
770 /// The loop being checked.
771 const Loop *TheLoop;
772
773 /// List of accesses that need a further dependence check.
774 MemAccessInfoList CheckDeps;
775
776 /// Set of pointers that are read only.
777 SmallPtrSet<Value*, 16> ReadOnlyPtr;
778
779 /// Batched alias analysis results.
780 BatchAAResults BAA;
781
782 /// An alias set tracker to partition the access set by underlying object and
783 /// intrinsic property (such as TBAA metadata).
784 AliasSetTracker AST;
785
786 LoopInfo *LI;
787
788 /// Sets of potentially dependent accesses - members of one set share an
789 /// underlying pointer. The set "CheckDeps" identifies which sets really need a
790 /// dependence check.
792
793 /// Initial processing of memory accesses determined that we may need
794 /// to add memchecks. Perform the analysis to determine the necessary checks.
795 ///
796 /// Note that this is different from isDependencyCheckNeeded. When we retry
797 /// memcheck analysis without dependency checking
798 /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
799 /// cleared while this remains set if we have potentially dependent accesses.
800 bool IsRTCheckAnalysisNeeded = false;
801
802 /// The SCEV predicate containing all the SCEV-related assumptions.
804
806
807 /// Alias scopes that are declared inside the loop, and as such not valid
808 /// across iterations.
809 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
810};
811
812} // end anonymous namespace
813
814/// Check whether a pointer can participate in a runtime bounds check.
815/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
816/// by adding run-time checks (overflow checks) if necessary.
818 const SCEV *PtrScev, Loop *L, bool Assume) {
819 // The bounds for a loop-invariant pointer are trivial.
820 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
821 return true;
822
823 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
824
825 if (!AR && Assume)
826 AR = PSE.getAsAddRec(Ptr);
827
828 if (!AR)
829 return false;
830
831 return AR->isAffine();
832}
833
834/// Check whether a pointer address cannot wrap.
836 const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr, Type *AccessTy,
837 Loop *L) {
838 const SCEV *PtrScev = PSE.getSCEV(Ptr);
839 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
840 return true;
841
842 int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
843 if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
844 return true;
845
846 return false;
847}
848
849static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
850 function_ref<void(Value *)> AddPointer) {
852 SmallVector<Value *> WorkList;
853 WorkList.push_back(StartPtr);
854
855 while (!WorkList.empty()) {
856 Value *Ptr = WorkList.pop_back_val();
857 if (!Visited.insert(Ptr).second)
858 continue;
859 auto *PN = dyn_cast<PHINode>(Ptr);
860 // SCEV does not look through non-header PHIs inside the loop. Such phis
861 // can be analyzed by adding separate accesses for each incoming pointer
862 // value.
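 // Editorial IR sketch (assumed example, not from the original source):
 //   %p = phi ptr [ %a, %if.then ], [ %b, %if.else ] ; non-header phi
 //   %v = load i32, ptr %p
 // visitPointers registers %a and %b as two separate pointer accesses here.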
863 if (PN && InnermostLoop.contains(PN->getParent()) &&
864 PN->getParent() != InnermostLoop.getHeader()) {
865 for (const Use &Inc : PN->incoming_values())
866 WorkList.push_back(Inc);
867 } else
868 AddPointer(Ptr);
869 }
870}
871
872// Walk back through the IR for a pointer, looking for a select like the
873// following:
874//
875// %offset = select i1 %cmp, i64 %a, i64 %b
876// %addr = getelementptr double, double* %base, i64 %offset
877// %ld = load double, double* %addr, align 8
878//
879// We won't be able to form a single SCEVAddRecExpr from this since the
880// address for each loop iteration depends on %cmp. We could potentially
881// produce multiple valid SCEVAddRecExprs, though, and check all of them for
882// memory safety/aliasing if needed.
883//
884// If we encounter some IR we don't yet handle, or something obviously fine
885// like a constant, then we just add the SCEV for that term to the list passed
886// in by the caller. If we have a node that may potentially yield a valid
887// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
888// ourselves before adding to the list.
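// Editorial note (assumed example, not from the original source): for the IR
// above, the GEP is decomposed into two SCEVs, roughly
//   (%base + 8 * %a) and (%base + 8 * %b),
// one per select arm, which the caller can then analyze independently.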
889static void findForkedSCEVs(
890 ScalarEvolution *SE, const Loop *L, Value *Ptr,
892 unsigned Depth) {
893 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
894 // we've exceeded our limit on recursion, just return whatever we have
895 // regardless of whether it can be used for a forked pointer or not, along
896 // with an indication of whether it might be a poison or undef value.
897 const SCEV *Scev = SE->getSCEV(Ptr);
898 if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
899 !isa<Instruction>(Ptr) || Depth == 0) {
900 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
901 return;
902 }
903
904 Depth--;
905
906 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
907 return get<1>(S);
908 };
909
910 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
911 switch (Opcode) {
912 case Instruction::Add:
913 return SE->getAddExpr(L, R);
914 case Instruction::Sub:
915 return SE->getMinusSCEV(L, R);
916 default:
917 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
918 }
919 };
920
921 Instruction *I = cast<Instruction>(Ptr);
922 unsigned Opcode = I->getOpcode();
923 switch (Opcode) {
924 case Instruction::GetElementPtr: {
925 GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
926 Type *SourceTy = GEP->getSourceElementType();
927 // We only handle base + single offset GEPs here for now.
928 // Not dealing with preexisting gathers yet, so no vectors.
929 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
930 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
931 break;
932 }
935 findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
936 findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
937
938 // See if we need to freeze our fork...
939 bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
940 any_of(OffsetScevs, UndefPoisonCheck);
941
942 // Check that we only have a single fork, on either the base or the offset.
943 // Copy the SCEV across for the one without a fork in order to generate
944 // the full SCEV for both sides of the GEP.
945 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
946 BaseScevs.push_back(BaseScevs[0]);
947 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
948 OffsetScevs.push_back(OffsetScevs[0]);
949 else {
950 ScevList.emplace_back(Scev, NeedsFreeze);
951 break;
952 }
953
954 // Find the pointer type we need to extend to.
955 Type *IntPtrTy = SE->getEffectiveSCEVType(
956 SE->getSCEV(GEP->getPointerOperand())->getType());
957
958 // Find the size of the type being pointed to. We only have a single
959 // index term (guarded above) so we don't need to index into arrays or
960 // structures, just get the size of the scalar value.
961 const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
962
963 // Scale up the offsets by the size of the type, then add to the bases.
964 const SCEV *Scaled1 = SE->getMulExpr(
965 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
966 const SCEV *Scaled2 = SE->getMulExpr(
967 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
968 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
969 NeedsFreeze);
970 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
971 NeedsFreeze);
972 break;
973 }
974 case Instruction::Select: {
976 // A select means we've found a forked pointer, but we currently only
977 // support a single select per pointer so if there's another behind this
978 // then we just bail out and return the generic SCEV.
979 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
980 findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
981 if (ChildScevs.size() == 2) {
982 ScevList.push_back(ChildScevs[0]);
983 ScevList.push_back(ChildScevs[1]);
984 } else
985 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
986 break;
987 }
988 case Instruction::PHI: {
990 // A phi means we've found a forked pointer, but we currently only
991 // support a single phi per pointer so if there's another behind this
992 // then we just bail out and return the generic SCEV.
993 if (I->getNumOperands() == 2) {
994 findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
995 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
996 }
997 if (ChildScevs.size() == 2) {
998 ScevList.push_back(ChildScevs[0]);
999 ScevList.push_back(ChildScevs[1]);
1000 } else
1001 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1002 break;
1003 }
1004 case Instruction::Add:
1005 case Instruction::Sub: {
1008 findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
1009 findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
1010
1011 // See if we need to freeze our fork...
1012 bool NeedsFreeze =
1013 any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
1014
1015 // Check that we only have a single fork, on either the left or right side.
1016 // Copy the SCEV across for the one without a fork in order to generate
1017 // the full SCEV for both sides of the BinOp.
1018 if (LScevs.size() == 2 && RScevs.size() == 1)
1019 RScevs.push_back(RScevs[0]);
1020 else if (RScevs.size() == 2 && LScevs.size() == 1)
1021 LScevs.push_back(LScevs[0]);
1022 else {
1023 ScevList.emplace_back(Scev, NeedsFreeze);
1024 break;
1025 }
1026
1027 ScevList.emplace_back(
1028 GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
1029 NeedsFreeze);
1030 ScevList.emplace_back(
1031 GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
1032 NeedsFreeze);
1033 break;
1034 }
1035 default:
1036 // Just return the current SCEV if we haven't handled the instruction yet.
1037 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1038 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1039 break;
1040 }
1041}
1042
1045 const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
1046 const Loop *L) {
1047 ScalarEvolution *SE = PSE.getSE();
1048 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1050 findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
1051
1052 // For now, we will only accept a forked pointer with two possible SCEVs
1053 // that are either SCEVAddRecExprs or loop invariant.
1054 if (Scevs.size() == 2 &&
1055 (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
1056 SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
1057 (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
1058 SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
1059 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
1060 LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
1061 LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
1062 return Scevs;
1063 }
1064
1065 return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
1066}
1067
1068bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
1069 MemAccessInfo Access, Type *AccessTy,
1070 const DenseMap<Value *, const SCEV *> &StridesMap,
1072 Loop *TheLoop, unsigned &RunningDepId,
1073 unsigned ASId, bool ShouldCheckWrap,
1074 bool Assume) {
1075 Value *Ptr = Access.getPointer();
1076
1078 findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
1079
1080 for (auto &P : TranslatedPtrs) {
1081 const SCEV *PtrExpr = get<0>(P);
1082 if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
1083 return false;
1084
1085 // When we run after a failing dependency check we have to make sure
1086 // we don't have wrapping pointers.
1087 if (ShouldCheckWrap) {
1088 // Skip wrap checking when translating pointers.
1089 if (TranslatedPtrs.size() > 1)
1090 return false;
1091
1092 if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
1093 auto *Expr = PSE.getSCEV(Ptr);
1094 if (!Assume || !isa<SCEVAddRecExpr>(Expr))
1095 return false;
1097 }
1098 }
1099 // If there's only one option for Ptr, look it up after bounds and wrap
1100 // checking, because assumptions might have been added to PSE.
1101 if (TranslatedPtrs.size() == 1)
1102 TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
1103 false};
1104 }
1105
1106 for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
1107 // The id of the dependence set.
1108 unsigned DepId;
1109
1110 if (isDependencyCheckNeeded()) {
1111 Value *Leader = DepCands.getLeaderValue(Access).getPointer();
1112 unsigned &LeaderId = DepSetId[Leader];
1113 if (!LeaderId)
1114 LeaderId = RunningDepId++;
1115 DepId = LeaderId;
1116 } else
1117 // Each access has its own dependence set.
1118 DepId = RunningDepId++;
1119
1120 bool IsWrite = Access.getInt();
1121 RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
1122 NeedsFreeze);
1123 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1124 }
1125
1126 return true;
1127}
1128
1129bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
1130 ScalarEvolution *SE, Loop *TheLoop,
1131 const DenseMap<Value *, const SCEV *> &StridesMap,
1132 Value *&UncomputablePtr, bool ShouldCheckWrap) {
1133 // Find pointers with computable bounds. We are going to use this information
1134 // to place a runtime bound check.
1135 bool CanDoRT = true;
1136
1137 bool MayNeedRTCheck = false;
1138 if (!IsRTCheckAnalysisNeeded) return true;
1139
1140 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1141
1142 // We assign a consecutive id to accesses from different alias sets.
1143 // Accesses between different groups don't need to be checked.
1144 unsigned ASId = 0;
1145 for (auto &AS : AST) {
1146 int NumReadPtrChecks = 0;
1147 int NumWritePtrChecks = 0;
1148 bool CanDoAliasSetRT = true;
1149 ++ASId;
1150 auto ASPointers = AS.getPointers();
1151
1152 // We assign consecutive ids to accesses from different dependence sets.
1153 // Accesses within the same set don't need a runtime check.
1154 unsigned RunningDepId = 1;
1156
1158
1159 // First, count how many write and read accesses are in the alias set. Also
1160 // collect MemAccessInfos for later.
1162 for (const Value *Ptr_ : ASPointers) {
1163 Value *Ptr = const_cast<Value *>(Ptr_);
1164 bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
1165 if (IsWrite)
1166 ++NumWritePtrChecks;
1167 else
1168 ++NumReadPtrChecks;
1169 AccessInfos.emplace_back(Ptr, IsWrite);
1170 }
1171
1172 // We do not need runtime checks for this alias set if there are no writes,
1173 // or a single write and no reads.
1174 if (NumWritePtrChecks == 0 ||
1175 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1176 assert((ASPointers.size() <= 1 ||
1177 all_of(ASPointers,
1178 [this](const Value *Ptr) {
1179 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1180 true);
1181 return DepCands.findValue(AccessWrite) == DepCands.end();
1182 })) &&
1183 "Can only skip updating CanDoRT below, if all entries in AS "
1184 "are reads or there is at most 1 entry");
1185 continue;
1186 }
1187
1188 for (auto &Access : AccessInfos) {
1189 for (const auto &AccessTy : Accesses[Access]) {
1190 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1191 DepSetId, TheLoop, RunningDepId, ASId,
1192 ShouldCheckWrap, false)) {
1193 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1194 << *Access.getPointer() << '\n');
1195 Retries.push_back({Access, AccessTy});
1196 CanDoAliasSetRT = false;
1197 }
1198 }
1199 }
1200
1201 // Note that this function computes CanDoRT and MayNeedRTCheck
1202 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1203 // we have a pointer for which we couldn't find the bounds but we don't
1204 // actually need to emit any checks so it does not matter.
1205 //
1206 // We need runtime checks for this alias set, if there are at least 2
1207 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1208 // any bound checks (because in that case the number of dependence sets is
1209 // incomplete).
1210 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1211
1212 // We need to perform run-time alias checks, but some pointers had bounds
1213 // that couldn't be checked.
1214 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1215 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1216 // We know that we need these checks, so we can now be more aggressive
1217 // and add further checks if required (overflow checks).
1218 CanDoAliasSetRT = true;
1219 for (auto Retry : Retries) {
1220 MemAccessInfo Access = Retry.first;
1221 Type *AccessTy = Retry.second;
1222 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1223 DepSetId, TheLoop, RunningDepId, ASId,
1224 ShouldCheckWrap, /*Assume=*/true)) {
1225 CanDoAliasSetRT = false;
1226 UncomputablePtr = Access.getPointer();
1227 break;
1228 }
1229 }
1230 }
1231
1232 CanDoRT &= CanDoAliasSetRT;
1233 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1234 ++ASId;
1235 }
1236
1237 // If the pointers that we would use for the bounds comparison have different
1238 // address spaces, assume the values aren't directly comparable, so we can't
1239 // use them for the runtime check. We also have to assume they could
1240 // overlap. In the future there should be metadata for whether address spaces
1241 // are disjoint.
1242 unsigned NumPointers = RtCheck.Pointers.size();
1243 for (unsigned i = 0; i < NumPointers; ++i) {
1244 for (unsigned j = i + 1; j < NumPointers; ++j) {
1245 // Only need to check pointers between two different dependency sets.
1246 if (RtCheck.Pointers[i].DependencySetId ==
1247 RtCheck.Pointers[j].DependencySetId)
1248 continue;
1249 // Only need to check pointers in the same alias set.
1250 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1251 continue;
1252
1253 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1254 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1255
1256 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1257 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1258 if (ASi != ASj) {
1259 LLVM_DEBUG(
1260 dbgs() << "LAA: Runtime check would require comparison between"
1261 " different address spaces\n");
1262 return false;
1263 }
1264 }
1265 }
1266
1267 if (MayNeedRTCheck && CanDoRT)
1268 RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
1269
1270 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1271 << " pointer comparisons.\n");
1272
1273 // If we can do run-time checks, but there are no checks, no runtime checks
1274 // are needed. This can happen when all pointers point to the same underlying
1275 // object for example.
1276 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1277
1278 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1279 if (!CanDoRTIfNeeded)
1280 RtCheck.reset();
1281 return CanDoRTIfNeeded;
1282}
1283
1284void AccessAnalysis::processMemAccesses() {
1285 // We process the set twice: first we process read-write pointers, last we
1286 // process read-only pointers. This allows us to skip dependence tests for
1287 // read-only pointers.
1288
1289 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1290 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1291 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1292 LLVM_DEBUG({
1293 for (auto A : Accesses)
1294 dbgs() << "\t" << *A.first.getPointer() << " ("
1295 << (A.first.getInt()
1296 ? "write"
1297 : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
1298 : "read"))
1299 << ")\n";
1300 });
1301
1302 // The AliasSetTracker has nicely partitioned our pointers by metadata
1303 // compatibility and potential for underlying-object overlap. As a result, we
1304 // only need to check for potential pointer dependencies within each alias
1305 // set.
1306 for (const auto &AS : AST) {
1307 // Note that both the alias-set tracker and the alias sets themselves use
1308 // ordered collections internally and so the iteration order here is
1309 // deterministic.
1310 auto ASPointers = AS.getPointers();
1311
1312 bool SetHasWrite = false;
1313
1314 // Map of pointers to last access encountered.
1315 typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
1316 UnderlyingObjToAccessMap ObjToLastAccess;
1317
1318 // Set of access to check after all writes have been processed.
1319 PtrAccessMap DeferredAccesses;
1320
1321 // Iterate over each alias set twice, once to process read/write pointers,
1322 // and then to process read-only pointers.
1323 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1324 bool UseDeferred = SetIteration > 0;
1325 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1326
1327 for (const Value *Ptr_ : ASPointers) {
1328 Value *Ptr = const_cast<Value *>(Ptr_);
1329
1330 // For a single memory access in AliasSetTracker, Accesses may contain
1331 // both read and write, and they both need to be handled for CheckDeps.
1332 for (const auto &AC : S) {
1333 if (AC.first.getPointer() != Ptr)
1334 continue;
1335
1336 bool IsWrite = AC.first.getInt();
1337
1338 // If we're using the deferred access set, then it contains only
1339 // reads.
1340 bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
1341 if (UseDeferred && !IsReadOnlyPtr)
1342 continue;
1343 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1344 // read or a write.
1345 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1346 S.count(MemAccessInfo(Ptr, false))) &&
1347 "Alias-set pointer not in the access set?");
1348
1349 MemAccessInfo Access(Ptr, IsWrite);
1350 DepCands.insert(Access);
1351
1352 // Memorize read-only pointers for later processing and skip them in
1353 // the first round (they need to be checked after we have seen all
1354 // write pointers). Note: we also mark pointers that are not
1355 // consecutive as "read-only" pointers (so that we check
1356 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 if (!UseDeferred && IsReadOnlyPtr) {
1358 // We only use the pointer keys, the types vector values don't
1359 // matter.
1360 DeferredAccesses.insert({Access, {}});
1361 continue;
1362 }
1363
1364 // If this is a write - check other reads and writes for conflicts. If
1365 // this is a read, only check other writes for conflicts (but only if
1366 // there is no other write to the ptr - this is an optimization to
1367 // catch "a[i] = a[i] + " without having to do a dependence check).
1368 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1369 CheckDeps.push_back(Access);
1370 IsRTCheckAnalysisNeeded = true;
1371 }
1372
1373 if (IsWrite)
1374 SetHasWrite = true;
1375
1376 // Create sets of pointers connected by a shared alias set and
1377 // underlying object.
1378 typedef SmallVector<const Value *, 16> ValueVector;
1379 ValueVector TempObjects;
1380
1381 UnderlyingObjects[Ptr] = {};
1382 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1383 ::getUnderlyingObjects(Ptr, UOs, LI);
1385 << "Underlying objects for pointer " << *Ptr << "\n");
1386 for (const Value *UnderlyingObj : UOs) {
1387 // A nullptr never aliases; don't join sets for pointers that have "null"
1388 // in their UnderlyingObjects list.
1389 if (isa<ConstantPointerNull>(UnderlyingObj) &&
1391 TheLoop->getHeader()->getParent(),
1392 UnderlyingObj->getType()->getPointerAddressSpace()))
1393 continue;
1394
1395 UnderlyingObjToAccessMap::iterator Prev =
1396 ObjToLastAccess.find(UnderlyingObj);
1397 if (Prev != ObjToLastAccess.end())
1398 DepCands.unionSets(Access, Prev->second);
1399
1400 ObjToLastAccess[UnderlyingObj] = Access;
1401 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1402 }
1403 }
1404 }
1405 }
1406 }
1407}
1408
1409/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
1410/// i.e. monotonically increasing/decreasing.
1411static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
1412 PredicatedScalarEvolution &PSE, const Loop *L) {
1413
1414 // FIXME: This should probably only return true for NUW.
1416 return true;
1417
1419 return true;
1420
1421 // Scalar evolution does not propagate the non-wrapping flags to values that
1422 // are derived from a non-wrapping induction variable because non-wrapping
1423 // could be flow-sensitive.
1424 //
1425 // Look through the potentially overflowing instruction to try to prove
1426 // non-wrapping for the *specific* value of Ptr.
1427
1428 // The arithmetic implied by an inbounds GEP can't overflow.
1429 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1430 if (!GEP || !GEP->isInBounds())
1431 return false;
1432
1433 // Make sure there is only one non-const index and analyze that.
1434 Value *NonConstIndex = nullptr;
1435 for (Value *Index : GEP->indices())
1436 if (!isa<ConstantInt>(Index)) {
1437 if (NonConstIndex)
1438 return false;
1439 NonConstIndex = Index;
1440 }
1441 if (!NonConstIndex)
1442 // The recurrence is on the pointer, ignore for now.
1443 return false;
1444
1445 // The index in the GEP is signed. It is non-wrapping if it's derived from
1446 // an NSW AddRec using an NSW operation.
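 // Editorial illustration (assumed example, not from the original source):
 //   %idx = add nsw i64 %iv, 5 ; %iv is an NSW AddRec on this loop
 //   %gep = getelementptr inbounds i32, ptr %p, i64 %idx
 // matches the pattern checked below (NSW add of a constant to an NSW AddRec).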
1447 if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
1448 if (OBO->hasNoSignedWrap() &&
1449 // Assume a constant for the other operand so that the AddRec can be
1450 // easily found.
1451 isa<ConstantInt>(OBO->getOperand(1))) {
1452 auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
1453
1454 if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
1455 return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
1456 }
1457
1458 return false;
1459}
1460
1461/// Check whether the access through \p Ptr has a constant stride.
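/// For illustration (editorial example, not part of the original comment):
/// for an access A[2 * i] with i16 elements, the AddRec step is 4 bytes and
/// the allocation size of i16 is 2, so the returned stride is 2; a step that
/// is not a multiple of the access size yields std::nullopt.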
1463 Type *AccessTy, Value *Ptr,
1464 const Loop *Lp,
1465 const DenseMap<Value *, const SCEV *> &StridesMap,
1466 bool Assume, bool ShouldCheckWrap) {
1467 Type *Ty = Ptr->getType();
1468 assert(Ty->isPointerTy() && "Unexpected non-ptr");
1469
1470 if (isa<ScalableVectorType>(AccessTy)) {
1471 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
1472 << "\n");
1473 return std::nullopt;
1474 }
1475
1476 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1477
1478 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1479 if (Assume && !AR)
1480 AR = PSE.getAsAddRec(Ptr);
1481
1482 if (!AR) {
1483 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1484 << " SCEV: " << *PtrScev << "\n");
1485 return std::nullopt;
1486 }
1487
1488 // The access function must stride over the innermost loop.
1489 if (Lp != AR->getLoop()) {
1490 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1491 << *Ptr << " SCEV: " << *AR << "\n");
1492 return std::nullopt;
1493 }
1494
1495 // Check the step is constant.
1496 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1497
1498 // Calculate the pointer stride and check if it is constant.
1499 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1500 if (!C) {
1501 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1502 << " SCEV: " << *AR << "\n");
1503 return std::nullopt;
1504 }
1505
1506 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1507 TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
1508 int64_t Size = AllocSize.getFixedValue();
1509 const APInt &APStepVal = C->getAPInt();
1510
1511 // Huge step value - give up.
1512 if (APStepVal.getBitWidth() > 64)
1513 return std::nullopt;
1514
1515 int64_t StepVal = APStepVal.getSExtValue();
1516
1517 // Strided access.
1518 int64_t Stride = StepVal / Size;
1519 int64_t Rem = StepVal % Size;
1520 if (Rem)
1521 return std::nullopt;
1522
1523 if (!ShouldCheckWrap)
1524 return Stride;
1525
1526 // The address calculation must not wrap. Otherwise, a dependence could be
1527 // inverted.
1528 if (isNoWrapAddRec(Ptr, AR, PSE, Lp))
1529 return Stride;
1530
1531 // An inbounds getelementptr that is an AddRec with a unit stride
1532 // cannot wrap per definition. If it did, the result would be poison
1533 // and any memory access dependent on it would be immediate UB
1534 // when executed.
1535 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1536 GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1))
1537 return Stride;
1538
1539 // If the null pointer is undefined, then an access sequence which would
1540 // otherwise access it can be assumed not to wrap (unsigned). Note that this
1541 // assumes the object in memory is aligned to the natural alignment.
1542 unsigned AddrSpace = Ty->getPointerAddressSpace();
1543 if (!NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace) &&
1544 (Stride == 1 || Stride == -1))
1545 return Stride;
1546
1547 if (Assume) {
1549 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1550 << "LAA: Pointer: " << *Ptr << "\n"
1551 << "LAA: SCEV: " << *AR << "\n"
1552 << "LAA: Added an overflow assumption\n");
1553 return Stride;
1554 }
1555 LLVM_DEBUG(
1556 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1557 << *Ptr << " SCEV: " << *AR << "\n");
1558 return std::nullopt;
1559}
1560
1561std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1562 Type *ElemTyB, Value *PtrB,
1563 const DataLayout &DL,
1564 ScalarEvolution &SE, bool StrictCheck,
1565 bool CheckType) {
1566 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1567
1568 // Make sure that A and B are different pointers.
1569 if (PtrA == PtrB)
1570 return 0;
1571
1572 // Make sure that the element types are the same if required.
1573 if (CheckType && ElemTyA != ElemTyB)
1574 return std::nullopt;
1575
1576 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1577 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1578
1579 // Check that the address spaces match.
1580 if (ASA != ASB)
1581 return std::nullopt;
1582 unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1583
1584 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1585 Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1586 Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1587
1588 int Val;
1589 if (PtrA1 == PtrB1) {
1590 // Retrieve the address space again as pointer stripping now tracks through
1591 // `addrspacecast`.
1592 ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
1593 ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
1594 // Check that the address spaces match and that the pointers are valid.
1595 if (ASA != ASB)
1596 return std::nullopt;
1597
1598 IdxWidth = DL.getIndexSizeInBits(ASA);
1599 OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1600 OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1601
1602 OffsetB -= OffsetA;
1603 Val = OffsetB.getSExtValue();
1604 } else {
1605 // Otherwise compute the distance with SCEV between the base pointers.
1606 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1607 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1608 const auto *Diff =
1609 dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
1610 if (!Diff)
1611 return std::nullopt;
1612 Val = Diff->getAPInt().getSExtValue();
1613 }
1614 int Size = DL.getTypeStoreSize(ElemTyA);
1615 int Dist = Val / Size;
1616
1617 // Ensure that the calculated distance matches the type-based one after
1618 // stripping all bitcasts from the provided pointers.
1619 if (!StrictCheck || Dist * Size == Val)
1620 return Dist;
1621 return std::nullopt;
1622}
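// Illustrative example (hypothetical values): for two i32 element pointers
// where PtrB is 8 bytes past PtrA, Val is 8 and Size is 4, so the returned
// distance is 2; with StrictCheck the result is kept because 2 * 4 == 8. If
// the byte difference were 6 instead, StrictCheck would reject it since
// 1 * 4 != 6, and std::nullopt would be returned.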
1623
1624bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1625 const DataLayout &DL, ScalarEvolution &SE,
1626 SmallVectorImpl<unsigned> &SortedIndices) {
1627 assert(llvm::all_of(
1628 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1629 "Expected list of pointer operands.");
1630 // Walk over the pointers, and map each of them to an offset relative to
1631 // first pointer in the array.
1632 Value *Ptr0 = VL[0];
1633
1634 using DistOrdPair = std::pair<int64_t, int>;
1635 auto Compare = llvm::less_first();
1636 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1637 Offsets.emplace(0, 0);
1638 int Cnt = 1;
1639 bool IsConsecutive = true;
1640 for (auto *Ptr : VL.drop_front()) {
1641 std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
1642 /*StrictCheck=*/true);
1643 if (!Diff)
1644 return false;
1645
1646 // Check if the pointer with the same offset is found.
1647 int64_t Offset = *Diff;
1648 auto Res = Offsets.emplace(Offset, Cnt);
1649 if (!Res.second)
1650 return false;
1651 // Consecutive order if the inserted element is the last one.
1652 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
1653 ++Cnt;
1654 }
1655 SortedIndices.clear();
1656 if (!IsConsecutive) {
1657 // Fill SortedIndices array only if it is non-consecutive.
1658 SortedIndices.resize(VL.size());
1659 Cnt = 0;
1660 for (const std::pair<int64_t, int> &Pair : Offsets) {
1661 SortedIndices[Cnt] = Pair.second;
1662 ++Cnt;
1663 }
1664 }
1665 return true;
1666}
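// Illustrative example: for three i32 pointers at byte offsets 0, 8 and 4 from
// the first one, the element distances are 0, 2 and 1. The (distance, index)
// pairs sort as (0,0), (1,2), (2,1); the last insertion is not at the end of
// the set, so IsConsecutive becomes false and SortedIndices is filled with
// {0, 2, 1}.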
1667
1668/// Returns true if the memory operations \p A and \p B are consecutive.
1669bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1670 ScalarEvolution &SE, bool CheckType) {
1671 Value *PtrA = getLoadStorePointerOperand(A);
1672 Value *PtrB = getLoadStorePointerOperand(B);
1673 if (!PtrA || !PtrB)
1674 return false;
1675 Type *ElemTyA = getLoadStoreType(A);
1676 Type *ElemTyB = getLoadStoreType(B);
1677 std::optional<int> Diff =
1678 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1679 /*StrictCheck=*/true, CheckType);
1680 return Diff && *Diff == 1;
1681}
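// Illustrative usage sketch (hypothetical loads L0 and L1): for two i32 loads
// from A[i] and A[i+1] the element distance computed by getPointersDiff is 1,
// so isConsecutiveAccess(L0, L1, DL, SE, /*CheckType=*/true) returns true; for
// A[i] and A[i+2] the distance is 2 and the result is false.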
1682
1683void MemoryDepChecker::addAccess(StoreInst *SI) {
1684 visitPointers(SI->getPointerOperand(), *InnermostLoop,
1685 [this, SI](Value *Ptr) {
1686 Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
1687 InstMap.push_back(SI);
1688 ++AccessIdx;
1689 });
1690}
1691
1692void MemoryDepChecker::addAccess(LoadInst *LI) {
1693 visitPointers(LI->getPointerOperand(), *InnermostLoop,
1694 [this, LI](Value *Ptr) {
1695 Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
1696 InstMap.push_back(LI);
1697 ++AccessIdx;
1698 });
1699}
1700
1701MemoryDepChecker::VectorizationSafetyStatus
1702MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1703 switch (Type) {
1704 case NoDep:
1705 case Forward:
1706 case BackwardVectorizable:
1707 return VectorizationSafetyStatus::Safe;
1708
1709 case Unknown:
1710 return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1711 case ForwardButPreventsForwarding:
1712 case Backward:
1713 case BackwardVectorizableButPreventsForwarding:
1714 case IndirectUnsafe:
1715 return VectorizationSafetyStatus::Unsafe;
1716 }
1717 llvm_unreachable("unexpected DepType!");
1718}
1719
1720bool MemoryDepChecker::Dependence::isBackward() const {
1721 switch (Type) {
1722 case NoDep:
1723 case Forward:
1724 case ForwardButPreventsForwarding:
1725 case Unknown:
1726 case IndirectUnsafe:
1727 return false;
1728
1729 case BackwardVectorizable:
1730 case Backward:
1731 case BackwardVectorizableButPreventsForwarding:
1732 return true;
1733 }
1734 llvm_unreachable("unexpected DepType!");
1735}
1736
1737bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1738 return isBackward() || Type == Unknown;
1739}
1740
1741bool MemoryDepChecker::Dependence::isForward() const {
1742 switch (Type) {
1743 case Forward:
1744 case ForwardButPreventsForwarding:
1745 return true;
1746
1747 case NoDep:
1748 case Unknown:
1749 case BackwardVectorizable:
1750 case Backward:
1751 case BackwardVectorizableButPreventsForwarding:
1752 case IndirectUnsafe:
1753 return false;
1754 }
1755 llvm_unreachable("unexpected DepType!");
1756}
1757
1758bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1759 uint64_t TypeByteSize) {
1760 // If loads occur at a distance that is not a multiple of a feasible vector
1761 // factor, store-load forwarding does not take place.
1762 // Positive dependences might cause trouble because vectorizing them might
1763 // prevent store-load forwarding, making vectorized code run a lot slower.
1764 // a[i] = a[i-3] ^ a[i-8];
1765 // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2], and
1766 // hence on a typical architecture store-to-load forwarding does not take
1767 // place. Vectorizing in such cases does not make sense.
1768 // Store-load forwarding distance.
1769
1770 // After this many iterations store-to-load forwarding conflicts should not
1771 // cause any slowdowns.
1772 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1773 // Maximum vector factor.
1774 uint64_t MaxVFWithoutSLForwardIssues = std::min(
1775 VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
1776
1777 // Compute the smallest VF at which the store and load would be misaligned.
1778 for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1779 VF *= 2) {
1780 // If the number of vector iterations between the store and the load is
1781 // small, we could incur conflicts.
1782 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1783 MaxVFWithoutSLForwardIssues = (VF >> 1);
1784 break;
1785 }
1786 }
1787
1788 if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1789 LLVM_DEBUG(
1790 dbgs() << "LAA: Distance " << Distance
1791 << " that could cause a store-load forwarding conflict\n");
1792 return true;
1793 }
1794
1795 if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
1796 MaxVFWithoutSLForwardIssues !=
1797 VectorizerParams::MaxVectorWidth * TypeByteSize)
1798 MinDepDistBytes = MaxVFWithoutSLForwardIssues;
1799 return false;
1800}
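// Illustrative example of the check above: with TypeByteSize = 4 and a
// dependence distance of 12 bytes, the first candidate VF is 8 bytes;
// 12 % 8 != 0 and 12 / 8 = 1 < 32 iterations, so MaxVFWithoutSLForwardIssues
// is clipped to 4 bytes. That is below the 2 * TypeByteSize minimum, so the
// function reports a potential store-to-load forwarding conflict.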
1801
1802void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1803 if (Status < S)
1804 Status = S;
1805}
1806
1807/// Given a dependence-distance \p Dist between two
1808/// memory accesses that have the same stride, whose absolute value is given
1809/// in \p Stride, and that have the same type size \p TypeByteSize,
1810/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1811/// possible to prove statically that the dependence distance is larger
1812/// than the range that the accesses will travel through the execution of
1813/// the loop. If so, return true; false otherwise. This is useful for
1814/// example in loops such as the following (PR31098):
1815/// for (i = 0; i < D; ++i) {
1816/// = out[i];
1817/// out[i+D] =
1818/// }
1819static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1820 const SCEV &BackedgeTakenCount,
1821 const SCEV &Dist, uint64_t Stride,
1822 uint64_t TypeByteSize) {
1823
1824 // If we can prove that
1825 // (**) |Dist| > BackedgeTakenCount * Step
1826 // where Step is the absolute stride of the memory accesses in bytes,
1827 // then there is no dependence.
1828 //
1829 // Rationale:
1830 // We basically want to check if the absolute distance (|Dist/Step|)
1831 // is >= the loop iteration count (or > BackedgeTakenCount).
1832 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1833 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1834 // that the dependence distance is >= VF; This is checked elsewhere.
1835 // But in some cases we can prune dependence distances early, and
1836 // even before selecting the VF, and without a runtime test, by comparing
1837 // the distance against the loop iteration count. Since the vectorized code
1838 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1839 // also guarantees that distance >= VF.
1840 //
1841 const uint64_t ByteStride = Stride * TypeByteSize;
1842 const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1843 const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1844
1845 const SCEV *CastedDist = &Dist;
1846 const SCEV *CastedProduct = Product;
1847 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
1848 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
1849
1850 // The dependence distance can be positive/negative, so we sign extend Dist;
1851 // The multiplication of the absolute stride in bytes and the
1852 // backedgeTakenCount is non-negative, so we zero extend Product.
1853 if (DistTypeSizeBits > ProductTypeSizeBits)
1854 CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1855 else
1856 CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1857
1858 // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1859 // (If so, then we have proven (**) because |Dist| >= Dist)
1860 const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1861 if (SE.isKnownPositive(Minus))
1862 return true;
1863
1864 // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1865 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1866 const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1867 Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1868 if (SE.isKnownPositive(Minus))
1869 return true;
1870
1871 return false;
1872}
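// Illustrative example: for a dependence distance of 1024 bytes, Stride = 1,
// TypeByteSize = 4 and a backedge-taken count of 100, the product
// BackedgeTakenCount * Step is 400 and 1024 - 400 > 0, so the two accesses
// cannot overlap within this loop and the dependence can be disregarded.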
1873
1874/// Check the dependence for two accesses with the same stride \p Stride.
1875/// \p Distance is the positive distance and \p TypeByteSize is type size in
1876/// bytes.
1877///
1878/// \returns true if they are independent.
1879static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1880 uint64_t TypeByteSize) {
1881 assert(Stride > 1 && "The stride must be greater than 1");
1882 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1883 assert(Distance > 0 && "The distance must be non-zero");
1884
1885 // Skip if the distance is not a multiple of the type byte size.
1886 if (Distance % TypeByteSize)
1887 return false;
1888
1889 uint64_t ScaledDist = Distance / TypeByteSize;
1890
1891 // No dependence if the scaled distance is not a multiple of the stride.
1892 // E.g.
1893 // for (i = 0; i < 1024 ; i += 4)
1894 // A[i+2] = A[i] + 1;
1895 //
1896 // Two accesses in memory (scaled distance is 2, stride is 4):
1897 // | A[0] | | | | A[4] | | | |
1898 // | | | A[2] | | | | A[6] | |
1899 //
1900 // E.g.
1901 // for (i = 0; i < 1024 ; i += 3)
1902 // A[i+4] = A[i] + 1;
1903 //
1904 // Two accesses in memory (scaled distance is 4, stride is 3):
1905 // | A[0] | | | A[3] | | | A[6] | | |
1906 // | | | | | A[4] | | | A[7] | |
1907 return ScaledDist % Stride;
1908}
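// Illustrative counter-example: with Distance = 16, Stride = 4 and
// TypeByteSize = 4 the scaled distance is 4 and 4 % 4 == 0, so the strided
// accesses do hit common locations and are not proven independent.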
1909
1910/// Returns true if any of the underlying objects has a loop varying address,
1911/// i.e. may change in \p L.
1912static bool
1913isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
1914 ScalarEvolution &SE, const Loop *L) {
1915 return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
1916 return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
1917 });
1918}
1919
1920namespace {
1921struct DepDistanceStrideAndSizeInfo {
1922 const SCEV *Dist;
1923 uint64_t Stride;
1924 uint64_t TypeByteSize;
1925 bool AIsWrite;
1926 bool BIsWrite;
1927
1928 DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t Stride,
1929 uint64_t TypeByteSize, bool AIsWrite,
1930 bool BIsWrite)
1931 : Dist(Dist), Stride(Stride), TypeByteSize(TypeByteSize),
1932 AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
1933};
1934} // namespace
1935
1936// Get the dependence distance, stride, type size and whether it is a write for
1937// the dependence between A and B. Returns a DepType if we can prove there is
1938// no dependence or the analysis fails. Outlined to a helper function to limit
1939// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and others.
1940// Returns either the dependence result, if it could already be determined, or a
1941// struct containing (Distance, Stride, TypeSize, AIsWrite, BIsWrite).
1942static std::variant<MemoryDepChecker::Dependence::DepType,
1943 DepDistanceStrideAndSizeInfo>
1944getDependenceDistanceStrideAndSize(
1945 const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
1946 const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
1947 const DenseMap<Value *, const SCEV *> &Strides,
1948 const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1949 PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1950 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1951 auto &SE = *PSE.getSE();
1952 auto [APtr, AIsWrite] = A;
1953 auto [BPtr, BIsWrite] = B;
1954
1955 // Two reads are independent.
1956 if (!AIsWrite && !BIsWrite)
1957 return MemoryDepChecker::Dependence::NoDep;
1958
1959 Type *ATy = getLoadStoreType(AInst);
1960 Type *BTy = getLoadStoreType(BInst);
1961
1962 // We cannot check pointers in different address spaces.
1963 if (APtr->getType()->getPointerAddressSpace() !=
1964 BPtr->getType()->getPointerAddressSpace())
1965 return MemoryDepChecker::Dependence::Unknown;
1966
1967 int64_t StrideAPtr =
1968 getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
1969 int64_t StrideBPtr =
1970 getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
1971
1972 const SCEV *Src = PSE.getSCEV(APtr);
1973 const SCEV *Sink = PSE.getSCEV(BPtr);
1974
1975 // If the induction step is negative we have to invert source and sink of the
1976 // dependence when measuring the distance between them. We should not swap
1977 // AIsWrite with BIsWrite, as their uses expect them in program order.
1978 if (StrideAPtr < 0) {
1979 std::swap(Src, Sink);
1980 std::swap(AInst, BInst);
1981 }
1982
1983 const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
1984
1985 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1986 << "(Induction step: " << StrideAPtr << ")\n");
1987 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
1988 << ": " << *Dist << "\n");
1989
1990 // Needs accesses where the addresses of the accessed underlying objects do
1991 // not change within the loop.
1992 if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
1993 InnermostLoop) ||
1994 isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
1995 InnermostLoop))
1996 return MemoryDepChecker::Dependence::IndirectUnsafe;
1997
1998 // Need accesses with constant stride. We don't want to vectorize
1999 // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
2000 // in the address space.
2001 if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
2002 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2003 return MemoryDepChecker::Dependence::Unknown;
2004 }
2005
2006 uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
2007 bool HasSameSize =
2008 DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
2009 if (!HasSameSize)
2010 TypeByteSize = 0;
2011 uint64_t Stride = std::abs(StrideAPtr);
2012 return DepDistanceStrideAndSizeInfo(Dist, Stride, TypeByteSize, AIsWrite,
2013 BIsWrite);
2014}
2015
2016MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
2017 const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
2018 unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2019 const DenseMap<Value *, SmallVector<const Value *, 16>>
2020 &UnderlyingObjects) {
2021 assert(AIdx < BIdx && "Must pass arguments in program order");
2022
2023 // Get the dependence distance, stride, type size and which of the accesses
2024 // are writes for the dependence between A and B.
2025 auto Res = getDependenceDistanceStrideAndSize(
2026 A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2027 InnermostLoop);
2028 if (std::holds_alternative<Dependence::DepType>(Res))
2029 return std::get<Dependence::DepType>(Res);
2030
2031 const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] =
2032 std::get<DepDistanceStrideAndSizeInfo>(Res);
2033 bool HasSameSize = TypeByteSize > 0;
2034
2035 ScalarEvolution &SE = *PSE.getSE();
2036 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
2037 if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
2038 isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
2039 Stride, TypeByteSize))
2040 return Dependence::NoDep;
2041
2042 const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2043 if (!C) {
2044 LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
2045 FoundNonConstantDistanceDependence = true;
2046 return Dependence::Unknown;
2047 }
2048
2049 const APInt &Val = C->getAPInt();
2050 int64_t Distance = Val.getSExtValue();
2051
2052 // Attempt to prove strided accesses independent.
2053 if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
2054 areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
2055 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2056 return Dependence::NoDep;
2057 }
2058
2059 // Negative distances are not plausible dependencies.
2060 if (Val.isNegative()) {
2061 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2062 // There is no need to update MaxSafeVectorWidthInBits after call to
2063 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes,
2064 // since a forward dependency will allow vectorization using any width.
2065 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2066 (!HasSameSize || couldPreventStoreLoadForward(Val.abs().getZExtValue(),
2067 TypeByteSize))) {
2068 LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2069 return Dependence::ForwardButPreventsForwarding;
2070 }
2071
2072 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2073 return Dependence::Forward;
2074 }
2075
2076 // Write to the same location with the same size.
2077 if (Val == 0) {
2078 if (HasSameSize)
2079 return Dependence::Forward;
2080 LLVM_DEBUG(
2081 dbgs() << "LAA: Zero dependence difference but different type sizes\n");
2082 return Dependence::Unknown;
2083 }
2084
2085 assert(Val.isStrictlyPositive() && "Expect a positive value");
2086
2087 if (!HasSameSize) {
2088 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2089 "different type sizes\n");
2090 return Dependence::Unknown;
2091 }
2092
2093 // Bail out early if passed-in parameters make vectorization not feasible.
2094 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2095 VectorizerParams::VectorizationFactor : 1);
2096 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2097 VectorizerParams::VectorizationInterleave : 1);
2098 // The minimum number of iterations for a vectorized/unrolled version.
2099 unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
2100
2101 // It's not vectorizable if the distance is smaller than the minimum distance
2102 // needed for a vectorized/unrolled version. Vectorizing one iteration in
2103 // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2104 // TypeByteSize (there is no need to add the last gap distance).
2105 //
2106 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2107 // foo(int *A) {
2108 // int *B = (int *)((char *)A + 14);
2109 // for (i = 0 ; i < 1024 ; i += 2)
2110 // B[i] = A[i] + 1;
2111 // }
2112 //
2113 // Two accesses in memory (stride is 2):
2114 // | A[0] | | A[2] | | A[4] | | A[6] | |
2115 // | B[0] | | B[2] | | B[4] |
2116 //
2117 // Distance needs for vectorizing iterations except the last iteration:
2118 // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
2119 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2120 //
2121 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2122 // 12, which is less than distance.
2123 //
2124 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2125 // the minimum distance needed is 28, which is greater than distance. It is
2126 // not safe to do vectorization.
2127 uint64_t MinDistanceNeeded =
2128 TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
2129 if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
2130 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
2131 << Distance << '\n');
2132 return Dependence::Backward;
2133 }
2134
2135 // Unsafe if the minimum distance needed is greater than the smallest
2136 // dependence distance.
2137 if (MinDistanceNeeded > MinDepDistBytes) {
2138 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2139 << MinDistanceNeeded << " size in bytes\n");
2140 return Dependence::Backward;
2141 }
2142
2143 // Positive distance bigger than max vectorization factor.
2144 // FIXME: Should use max factor instead of max distance in bytes, which could
2145 // not handle different types.
2146 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2147 // void foo (int *A, char *B) {
2148 // for (unsigned i = 0; i < 1024; i++) {
2149 // A[i+2] = A[i] + 1;
2150 // B[i+2] = B[i] + 1;
2151 // }
2152 // }
2153 //
2154 // This case is currently unsafe according to the max safe distance. If we
2155 // analyze the two accesses on array B, the max safe dependence distance
2156 // is 2. Then we analyze the accesses on array A, where the minimum distance
2157 // needed is 8, which is greater than 2 and thus forbids vectorization. But
2158 // actually both A and B could be vectorized with a factor of 2.
2159 MinDepDistBytes =
2160 std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
2161
2162 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2163 uint64_t MinDepDistBytesOld = MinDepDistBytes;
2164 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2165 couldPreventStoreLoadForward(Distance, TypeByteSize)) {
2166 // Sanity check that we didn't update MinDepDistBytes when calling
2167 // couldPreventStoreLoadForward
2168 assert(MinDepDistBytes == MinDepDistBytesOld &&
2169 "An update to MinDepDistBytes requires an update to "
2170 "MaxSafeVectorWidthInBits");
2171 (void)MinDepDistBytesOld;
2172 return Dependence::BackwardVectorizableButPreventsForwarding;
2173 }
2174
2175 // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2176 // since there is a backwards dependency.
2177 uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride);
2178 LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2179 << " with max VF = " << MaxVF << '\n');
2180 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2181 MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
2182 return Dependence::BackwardVectorizable;
2183}
2184
2185bool MemoryDepChecker::areDepsSafe(
2186 DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
2187 const DenseMap<Value *, const SCEV *> &Strides,
2188 const DenseMap<Value *, SmallVector<const Value *, 16>>
2189 &UnderlyingObjects) {
2190
2191 MinDepDistBytes = -1;
2192 SmallPtrSet<MemAccessInfo, 8> Visited;
2193 for (MemAccessInfo CurAccess : CheckDeps) {
2194 if (Visited.count(CurAccess))
2195 continue;
2196
2197 // Get the relevant memory access set.
2198 EquivalenceClasses<MemAccessInfo>::iterator I =
2199 AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
2200
2201 // Check accesses within this set.
2202 EquivalenceClasses<MemAccessInfo>::member_iterator AI =
2203 AccessSets.member_begin(I);
2204 EquivalenceClasses<MemAccessInfo>::member_iterator AE =
2205 AccessSets.member_end();
2206
2207 // Check every access pair.
2208 while (AI != AE) {
2209 Visited.insert(*AI);
2210 bool AIIsWrite = AI->getInt();
2211 // Check loads only against the next equivalence class, but stores also
2212 // against other stores in the same equivalence class - to the same address.
2213 EquivalenceClasses<MemAccessInfo>::member_iterator OI =
2214 (AIIsWrite ? AI : std::next(AI));
2215 while (OI != AE) {
2216 // Check every accessing instruction pair in program order.
2217 for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
2218 I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
2219 // Scan all accesses of another equivalence class, but only the next
2220 // accesses of the same equivalence class.
2221 for (std::vector<unsigned>::iterator
2222 I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
2223 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2224 I2 != I2E; ++I2) {
2225 auto A = std::make_pair(&*AI, *I1);
2226 auto B = std::make_pair(&*OI, *I2);
2227
2228 assert(*I1 != *I2);
2229 if (*I1 > *I2)
2230 std::swap(A, B);
2231
2232 Dependence::DepType Type =
2233 isDependent(*A.first, A.second, *B.first, B.second, Strides,
2234 UnderlyingObjects);
2235 mergeInStatus(Dependence::isSafeForVectorization(Type));
2236
2237 // Gather dependences unless we accumulated MaxDependences
2238 // dependences. In that case return as soon as we find the first
2239 // unsafe dependence. This puts a limit on this quadratic
2240 // algorithm.
2241 if (RecordDependences) {
2242 if (Type != Dependence::NoDep)
2243 Dependences.push_back(Dependence(A.second, B.second, Type));
2244
2245 if (Dependences.size() >= MaxDependences) {
2246 RecordDependences = false;
2247 Dependences.clear();
2249 << "Too many dependences, stopped recording\n");
2250 }
2251 }
2252 if (!RecordDependences && !isSafeForVectorization())
2253 return false;
2254 }
2255 ++OI;
2256 }
2257 AI++;
2258 }
2259 }
2260
2261 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2262 return isSafeForVectorization();
2263}
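// Illustrative usage sketch (hypothetical DepChecker object and argument
// names): after the walk above, a client can check the verdict and, when
// recording was not disabled, inspect the collected dependences:
//
//   if (!DepChecker.areDepsSafe(AccessSets, CheckDeps, Strides, Objects))
//     if (const auto *Deps = DepChecker.getDependences())
//       for (const MemoryDepChecker::Dependence &D : *Deps)
//         D.print(dbgs(), /*Depth=*/2, DepChecker.getMemoryInstructions());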
2264
2265SmallVector<Instruction *, 4>
2266MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
2267 MemAccessInfo Access(Ptr, isWrite);
2268 auto &IndexVector = Accesses.find(Access)->second;
2269
2270 SmallVector<Instruction *, 4> Insts;
2271 transform(IndexVector,
2272 std::back_inserter(Insts),
2273 [&](unsigned Idx) { return this->InstMap[Idx]; });
2274 return Insts;
2275}
2276
2278 "NoDep",
2279 "Unknown",
2280 "IndirectUnsafe",
2281 "Forward",
2282 "ForwardButPreventsForwarding",
2283 "Backward",
2284 "BackwardVectorizable",
2285 "BackwardVectorizableButPreventsForwarding"};
2286
2287void MemoryDepChecker::Dependence::print(
2288 raw_ostream &OS, unsigned Depth,
2289 const SmallVectorImpl<Instruction *> &Instrs) const {
2290 OS.indent(Depth) << DepName[Type] << ":\n";
2291 OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
2292 OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
2293}
2294
2295bool LoopAccessInfo::canAnalyzeLoop() {
2296 // We need to have a loop header.
2297 LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
2298 << TheLoop->getHeader()->getParent()->getName() << ": "
2299 << TheLoop->getHeader()->getName() << '\n');
2300
2301 // We can only analyze innermost loops.
2302 if (!TheLoop->isInnermost()) {
2303 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2304 recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
2305 return false;
2306 }
2307
2308 // We must have a single backedge.
2309 if (TheLoop->getNumBackEdges() != 1) {
2310 LLVM_DEBUG(
2311 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2312 recordAnalysis("CFGNotUnderstood")
2313 << "loop control flow is not understood by analyzer";
2314 return false;
2315 }
2316
2317 // ScalarEvolution needs to be able to find the exit count.
2318 const SCEV *ExitCount = PSE->getBackedgeTakenCount();
2319 if (isa<SCEVCouldNotCompute>(ExitCount)) {
2320 recordAnalysis("CantComputeNumberOfIterations")
2321 << "could not determine number of loop iterations";
2322 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2323 return false;
2324 }
2325
2326 return true;
2327}
2328
2329void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
2330 const TargetLibraryInfo *TLI,
2331 DominatorTree *DT) {
2332 // Holds the Load and Store instructions.
2333 SmallVector<LoadInst *, 16> Loads;
2334 SmallVector<StoreInst *, 16> Stores;
2335 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2336
2337 // Holds all the different accesses in the loop.
2338 unsigned NumReads = 0;
2339 unsigned NumReadWrites = 0;
2340
2341 bool HasComplexMemInst = false;
2342
2343 // A runtime check is only legal to insert if there are no convergent calls.
2344 HasConvergentOp = false;
2345
2346 PtrRtChecking->Pointers.clear();
2347 PtrRtChecking->Need = false;
2348
2349 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2350
2351 const bool EnableMemAccessVersioningOfLoop =
2352 EnableMemAccessVersioning &&
2353 !TheLoop->getHeader()->getParent()->hasOptSize();
2354
2355 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2356 // loop info, as it may be arbitrary.
2357 LoopBlocksRPO RPOT(TheLoop);
2358 RPOT.perform(LI);
2359 for (BasicBlock *BB : RPOT) {
2360 // Scan the BB and collect legal loads and stores. Also detect any
2361 // convergent instructions.
2362 for (Instruction &I : *BB) {
2363 if (auto *Call = dyn_cast<CallBase>(&I)) {
2364 if (Call->isConvergent())
2365 HasConvergentOp = true;
2366 }
2367
2368 // If we have found both a non-vectorizable memory instruction and a
2369 // convergent operation in this loop, there is no reason to continue the search.
2370 if (HasComplexMemInst && HasConvergentOp) {
2371 CanVecMem = false;
2372 return;
2373 }
2374
2375 // Avoid hitting recordAnalysis multiple times.
2376 if (HasComplexMemInst)
2377 continue;
2378
2379 // Record alias scopes defined inside the loop.
2380 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
2381 for (Metadata *Op : Decl->getScopeList()->operands())
2382 LoopAliasScopes.insert(cast<MDNode>(Op));
2383
2384 // Many math library functions read the rounding mode. We will only
2385 // vectorize a loop if it contains known function calls that don't set
2386 // the flag. Therefore, it is safe to ignore this read from memory.
2387 auto *Call = dyn_cast<CallInst>(&I);
2388 if (Call && getVectorIntrinsicIDForCall(Call, TLI))
2389 continue;
2390
2391 // If this is a load, save it. If this instruction can read from memory
2392 // but is not a load, then we quit. Notice that we don't handle function
2393 // calls that read or write.
2394 if (I.mayReadFromMemory()) {
2395 // If the function has an explicit vectorized counterpart, we can safely
2396 // assume that it can be vectorized.
2397 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2398 !VFDatabase::getMappings(*Call).empty())
2399 continue;
2400
2401 auto *Ld = dyn_cast<LoadInst>(&I);
2402 if (!Ld) {
2403 recordAnalysis("CantVectorizeInstruction", Ld)
2404 << "instruction cannot be vectorized";
2405 HasComplexMemInst = true;
2406 continue;
2407 }
2408 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2409 recordAnalysis("NonSimpleLoad", Ld)
2410 << "read with atomic ordering or volatile read";
2411 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2412 HasComplexMemInst = true;
2413 continue;
2414 }
2415 NumLoads++;
2416 Loads.push_back(Ld);
2417 DepChecker->addAccess(Ld);
2418 if (EnableMemAccessVersioningOfLoop)
2419 collectStridedAccess(Ld);
2420 continue;
2421 }
2422
2423 // Save 'store' instructions. Abort if other instructions write to memory.
2424 if (I.mayWriteToMemory()) {
2425 auto *St = dyn_cast<StoreInst>(&I);
2426 if (!St) {
2427 recordAnalysis("CantVectorizeInstruction", St)
2428 << "instruction cannot be vectorized";
2429 HasComplexMemInst = true;
2430 continue;
2431 }
2432 if (!St->isSimple() && !IsAnnotatedParallel) {
2433 recordAnalysis("NonSimpleStore", St)
2434 << "write with atomic ordering or volatile write";
2435 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2436 HasComplexMemInst = true;
2437 continue;
2438 }
2439 NumStores++;
2440 Stores.push_back(St);
2441 DepChecker->addAccess(St);
2442 if (EnableMemAccessVersioningOfLoop)
2443 collectStridedAccess(St);
2444 }
2445 } // Next instr.
2446 } // Next block.
2447
2448 if (HasComplexMemInst) {
2449 CanVecMem = false;
2450 return;
2451 }
2452
2453 // Now we have two lists that hold the loads and the stores.
2454 // Next, we find the pointers that they use.
2455
2456 // Check if we see any stores. If there are no stores, then we don't
2457 // care if the pointers are *restrict*.
2458 if (!Stores.size()) {
2459 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2460 CanVecMem = true;
2461 return;
2462 }
2463
2464 MemoryDepChecker::DepCandidates DependentAccesses;
2465 AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
2466 LoopAliasScopes);
2467
2468 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2469 // multiple times on the same object. If the ptr is accessed twice, once
2470 // for read and once for write, it will only appear once (on the write
2471 // list). This is okay, since we are going to check for conflicts between
2472 // writes and between reads and writes, but not between reads and reads.
2473 SmallSet<std::pair<Value *, Type *>, 16> Seen;
2474
2475 // Record uniform store addresses to identify if we have multiple stores
2476 // to the same address.
2477 SmallPtrSet<Value *, 16> UniformStores;
2478
2479 for (StoreInst *ST : Stores) {
2480 Value *Ptr = ST->getPointerOperand();
2481
2482 if (isInvariant(Ptr)) {
2483 // Record store instructions to loop invariant addresses
2484 StoresToInvariantAddresses.push_back(ST);
2485 HasDependenceInvolvingLoopInvariantAddress |=
2486 !UniformStores.insert(Ptr).second;
2487 }
2488
2489 // If we did *not* see this pointer before, insert it to the read-write
2490 // list. At this phase it is only a 'write' list.
2491 Type *AccessTy = getLoadStoreType(ST);
2492 if (Seen.insert({Ptr, AccessTy}).second) {
2493 ++NumReadWrites;
2494
2495 MemoryLocation Loc = MemoryLocation::get(ST);
2496 // The TBAA metadata could have a control dependency on the predication
2497 // condition, so we cannot rely on it when determining whether or not we
2498 // need runtime pointer checks.
2499 if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
2500 Loc.AATags.TBAA = nullptr;
2501
2502 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2503 [&Accesses, AccessTy, Loc](Value *Ptr) {
2504 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2505 Accesses.addStore(NewLoc, AccessTy);
2506 });
2507 }
2508 }
2509
2510 if (IsAnnotatedParallel) {
2511 LLVM_DEBUG(
2512 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2513 << "checks.\n");
2514 CanVecMem = true;
2515 return;
2516 }
2517
2518 for (LoadInst *LD : Loads) {
2519 Value *Ptr = LD->getPointerOperand();
2520 // If we did *not* see this pointer before, insert it to the
2521 // read list. If we *did* see it before, then it is already in
2522 // the read-write list. This allows us to vectorize expressions
2523 // such as A[i] += x; Because the address of A[i] is a read-write
2524 // pointer. This only works if the index of A[i] is consecutive.
2525 // If the address of i is unknown (for example A[B[i]]) then we may
2526 // read a few words, modify, and write a few words, and some of the
2527 // words may be written to the same address.
2528 bool IsReadOnlyPtr = false;
2529 Type *AccessTy = getLoadStoreType(LD);
2530 if (Seen.insert({Ptr, AccessTy}).second ||
2531 !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
2532 ++NumReads;
2533 IsReadOnlyPtr = true;
2534 }
2535
2536 // See if there is an unsafe dependency between a load to a uniform address and
2537 // store to the same uniform address.
2538 if (UniformStores.count(Ptr)) {
2539 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2540 "load and uniform store to the same address!\n");
2541 HasDependenceInvolvingLoopInvariantAddress = true;
2542 }
2543
2544 MemoryLocation Loc = MemoryLocation::get(LD);
2545 // The TBAA metadata could have a control dependency on the predication
2546 // condition, so we cannot rely on it when determining whether or not we
2547 // need runtime pointer checks.
2548 if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
2549 Loc.AATags.TBAA = nullptr;
2550
2551 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2552 [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2553 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2554 Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
2555 });
2556 }
2557
2558 // If we write (or read-write) to a single destination and there are no
2559 // other reads in this loop then it is safe to vectorize.
2560 if (NumReadWrites == 1 && NumReads == 0) {
2561 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2562 CanVecMem = true;
2563 return;
2564 }
2565
2566 // Build dependence sets and check whether we need a runtime pointer bounds
2567 // check.
2568 Accesses.buildDependenceSets();
2569
2570 // Find pointers with computable bounds. We are going to use this information
2571 // to place a runtime bound check.
2572 Value *UncomputablePtr = nullptr;
2573 bool CanDoRTIfNeeded =
2574 Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
2575 SymbolicStrides, UncomputablePtr, false);
2576 if (!CanDoRTIfNeeded) {
2577 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2578 recordAnalysis("CantIdentifyArrayBounds", I)
2579 << "cannot identify array bounds";
2580 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2581 << "the array bounds.\n");
2582 CanVecMem = false;
2583 return;
2584 }
2585
2586 LLVM_DEBUG(
2587 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2588
2589 CanVecMem = true;
2590 if (Accesses.isDependencyCheckNeeded()) {
2591 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2592 CanVecMem = DepChecker->areDepsSafe(
2593 DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2594 Accesses.getUnderlyingObjects());
2595
2596 if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2597 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2598
2599 // Clear the dependency checks. We assume they are not needed.
2600 Accesses.resetDepChecks(*DepChecker);
2601
2602 PtrRtChecking->reset();
2603 PtrRtChecking->Need = true;
2604
2605 auto *SE = PSE->getSE();
2606 UncomputablePtr = nullptr;
2607 CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
2608 *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
2609
2610 // Check that we found the bounds for the pointer.
2611 if (!CanDoRTIfNeeded) {
2612 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2613 recordAnalysis("CantCheckMemDepsAtRunTime", I)
2614 << "cannot check memory dependencies at runtime";
2615 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2616 CanVecMem = false;
2617 return;
2618 }
2619
2620 CanVecMem = true;
2621 }
2622 }
2623
2624 if (HasConvergentOp) {
2625 recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2626 << "cannot add control dependency to convergent operation";
2627 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2628 "would be needed with a convergent operation\n");
2629 CanVecMem = false;
2630 return;
2631 }
2632
2633 if (CanVecMem)
2634 LLVM_DEBUG(
2635 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2636 << (PtrRtChecking->Need ? "" : " don't")
2637 << " need runtime memory checks.\n");
2638 else
2639 emitUnsafeDependenceRemark();
2640}
2641
2642void LoopAccessInfo::emitUnsafeDependenceRemark() {
2643 auto Deps = getDepChecker().getDependences();
2644 if (!Deps)
2645 return;
2646 auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
2647 return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
2648 MemoryDepChecker::VectorizationSafetyStatus::Safe;
2649 });
2650 if (Found == Deps->end())
2651 return;
2652 MemoryDepChecker::Dependence Dep = *Found;
2653
2654 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2655
2656 // Emit remark for first unsafe dependence
2657 bool HasForcedDistribution = false;
2658 std::optional<const MDOperand *> Value =
2659 findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
2660 if (Value) {
2661 const MDOperand *Op = *Value;
2662 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2663 HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
2664 }
2665
2666 const std::string Info =
2667 HasForcedDistribution
2668 ? "unsafe dependent memory operations in loop."
2669 : "unsafe dependent memory operations in loop. Use "
2670 "#pragma clang loop distribute(enable) to allow loop distribution "
2671 "to attempt to isolate the offending operations into a separate "
2672 "loop";
2673 OptimizationRemarkAnalysis &R =
2674 recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;
2675
2676 switch (Dep.Type) {
2677 case MemoryDepChecker::Dependence::NoDep:
2678 case MemoryDepChecker::Dependence::Forward:
2679 case MemoryDepChecker::Dependence::BackwardVectorizable:
2680 llvm_unreachable("Unexpected dependence");
2681 case MemoryDepChecker::Dependence::Backward:
2682 R << "\nBackward loop carried data dependence.";
2683 break;
2684 case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2685 R << "\nForward loop carried data dependence that prevents "
2686 "store-to-load forwarding.";
2687 break;
2688 case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2689 R << "\nBackward loop carried data dependence that prevents "
2690 "store-to-load forwarding.";
2691 break;
2692 case MemoryDepChecker::Dependence::IndirectUnsafe:
2693 R << "\nUnsafe indirect dependence.";
2694 break;
2695 case MemoryDepChecker::Dependence::Unknown:
2696 R << "\nUnknown data dependence.";
2697 break;
2698 }
2699
2700 if (Instruction *I = Dep.getSource(*this)) {
2701 DebugLoc SourceLoc = I->getDebugLoc();
2702 if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
2703 SourceLoc = DD->getDebugLoc();
2704 if (SourceLoc)
2705 R << " Memory location is the same as accessed at "
2706 << ore::NV("Location", SourceLoc);
2707 }
2708}
2709
2710bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
2711 DominatorTree *DT) {
2712 assert(TheLoop->contains(BB) && "Unknown block used");
2713
2714 // Blocks that do not dominate the latch need predication.
2715 BasicBlock* Latch = TheLoop->getLoopLatch();
2716 return !DT->dominates(BB, Latch);
2717}
2718
2719OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2720 Instruction *I) {
2721 assert(!Report && "Multiple reports generated");
2722
2723 Value *CodeRegion = TheLoop->getHeader();
2724 DebugLoc DL = TheLoop->getStartLoc();
2725
2726 if (I) {
2727 CodeRegion = I->getParent();
2728 // If there is no debug location attached to the instruction, revert back to
2729 // using the loop's.
2730 if (I->getDebugLoc())
2731 DL = I->getDebugLoc();
2732 }
2733
2734 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2735 CodeRegion);
2736 return *Report;
2737}
2738
2739bool LoopAccessInfo::isInvariant(Value *V) const {
2740 auto *SE = PSE->getSE();
2741 // TODO: Is this really what we want? Even without FP SCEV, we may want some
2742 // trivially loop-invariant FP values to be considered invariant.
2743 if (!SE->isSCEVable(V->getType()))
2744 return false;
2745 const SCEV *S = SE->getSCEV(V);
2746 return SE->isLoopInvariant(S, TheLoop);
2747}
2748
2749/// Find the operand of the GEP that should be checked for consecutive
2750/// stores. This ignores trailing indices that have no effect on the final
2751/// pointer.
2752static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
2753 const DataLayout &DL = Gep->getModule()->getDataLayout();
2754 unsigned LastOperand = Gep->getNumOperands() - 1;
2755 TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
2756
2757 // Walk backwards and try to peel off zeros.
2758 while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
2759 // Find the type we're currently indexing into.
2760 gep_type_iterator GEPTI = gep_type_begin(Gep);
2761 std::advance(GEPTI, LastOperand - 2);
2762
2763 // If it's a type with the same allocation size as the result of the GEP we
2764 // can peel off the zero index.
2765 TypeSize ElemSize = GEPTI.isStruct()
2766 ? DL.getTypeAllocSize(GEPTI.getIndexedType())
2768 if (ElemSize != GEPAllocSize)
2769 break;
2770 --LastOperand;
2771 }
2772
2773 return LastOperand;
2774}
2775
2776/// If the argument is a GEP, then returns the operand identified by
2777/// getGEPInductionOperand. However, if there is some other non-loop-invariant
2778/// operand, it returns that instead.
2779static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2780 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2781 if (!GEP)
2782 return Ptr;
2783
2784 unsigned InductionOperand = getGEPInductionOperand(GEP);
2785
2786 // Check that all of the gep indices are uniform except for our induction
2787 // operand.
2788 for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
2789 if (i != InductionOperand &&
2790 !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
2791 return Ptr;
2792 return GEP->getOperand(InductionOperand);
2793}
2794
2795/// If a value has only one user that is a CastInst, return it.
2796static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
2797 Value *UniqueCast = nullptr;
2798 for (User *U : Ptr->users()) {
2799 CastInst *CI = dyn_cast<CastInst>(U);
2800 if (CI && CI->getType() == Ty) {
2801 if (!UniqueCast)
2802 UniqueCast = CI;
2803 else
2804 return nullptr;
2805 }
2806 }
2807 return UniqueCast;
2808}
2809
2810/// Get the stride of a pointer access in a loop. Looks for symbolic
2811/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
2812static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2813 auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2814 if (!PtrTy || PtrTy->isAggregateType())
2815 return nullptr;
2816
2817 // Try to remove a gep instruction to make the pointer (actually the index at
2818 // this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
2819 // the pointer, otherwise we are analyzing the index.
2820 Value *OrigPtr = Ptr;
2821
2822 // The size of the pointer access.
2823 int64_t PtrAccessSize = 1;
2824
2825 Ptr = stripGetElementPtr(Ptr, SE, Lp);
2826 const SCEV *V = SE->getSCEV(Ptr);
2827
2828 if (Ptr != OrigPtr)
2829 // Strip off casts.
2830 while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
2831 V = C->getOperand();
2832
2833 const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
2834 if (!S)
2835 return nullptr;
2836
2837 // If the pointer is invariant then there is no stride and it makes no
2838 // sense to add it here.
2839 if (Lp != S->getLoop())
2840 return nullptr;
2841
2842 V = S->getStepRecurrence(*SE);
2843 if (!V)
2844 return nullptr;
2845
2846 // Strip off the size of access multiplication if we are still analyzing the
2847 // pointer.
2848 if (OrigPtr == Ptr) {
2849 if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
2850 if (M->getOperand(0)->getSCEVType() != scConstant)
2851 return nullptr;
2852
2853 const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
2854
2855 // Huge step value - give up.
2856 if (APStepVal.getBitWidth() > 64)
2857 return nullptr;
2858
2859 int64_t StepVal = APStepVal.getSExtValue();
2860 if (PtrAccessSize != StepVal)
2861 return nullptr;
2862 V = M->getOperand(1);
2863 }
2864 }
2865
2866 // Note that the restrictions after this loop invariant check are only
2867 // profitability restrictions.
2868 if (!SE->isLoopInvariant(V, Lp))
2869 return nullptr;
2870
2871 // Look for the loop invariant symbolic value.
2872 const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
2873 if (!U) {
2874 const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
2875 if (!C)
2876 return nullptr;
2877 U = dyn_cast<SCEVUnknown>(C->getOperand());
2878 if (!U)
2879 return nullptr;
2880
2881 // Match legacy behavior - this is not needed for correctness
2882 if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
2883 return nullptr;
2884 }
2885
2886 return V;
2887}
2888
2889void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2890 Value *Ptr = getLoadStorePointerOperand(MemAccess);
2891 if (!Ptr)
2892 return;
2893
2894 // Note: getStrideFromPointer is a *profitability* heuristic. We
2895 // could broaden the scope of values returned here - to anything
2896 // which happens to be loop invariant and contributes to the
2897 // computation of an interesting IV - but we chose not to as we
2898 // don't have a cost model here, and broadening the scope exposes
2899 // far too many unprofitable cases.
2900 const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2901 if (!StrideExpr)
2902 return;
2903
2904 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2905 "versioning:");
2906 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
2907
2908 if (!SpeculateUnitStride) {
2909 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
2910 return;
2911 }
2912
2913 // Avoid adding the "Stride == 1" predicate when we know that
2914 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2915 // or zero iteration loop, as Trip-Count <= Stride == 1.
2916 //
2917 // TODO: We are currently not making a very informed decision on when it is
2918 // beneficial to apply stride versioning. It might make more sense that the
2919 // users of this analysis (such as the vectorizer) will trigger it, based on
2920 // their specific cost considerations; For example, in cases where stride
2921 // versioning does not help resolving memory accesses/dependences, the
2922 // vectorizer should evaluate the cost of the runtime test, and the benefit
2923 // of various possible stride specializations, considering the alternatives
2924 // of using gather/scatters (if available).
2925
2926 const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2927
2928 // Match the types so we can compare the stride and the BETakenCount.
2929 // The Stride can be positive/negative, so we sign extend Stride;
2930 // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2931 const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2932 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
2933 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
2934 const SCEV *CastedStride = StrideExpr;
2935 const SCEV *CastedBECount = BETakenCount;
2936 ScalarEvolution *SE = PSE->getSE();
2937 if (BETypeSizeBits >= StrideTypeSizeBits)
2938 CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2939 else
2940 CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2941 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2942 // Since TripCount == BackEdgeTakenCount + 1, checking:
2943 // "Stride >= TripCount" is equivalent to checking:
2944 // Stride - BETakenCount > 0
2945 if (SE->isKnownPositive(StrideMinusBETaken)) {
2946 LLVM_DEBUG(
2947 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
2948 "Stride==1 predicate will imply that the loop executes "
2949 "at most once.\n");
2950 return;
2951 }
2952 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
2953
2954 // Strip back off the integer cast, and check that our result is a
2955 // SCEVUnknown as we expect.
2956 const SCEV *StrideBase = StrideExpr;
2957 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
2958 StrideBase = C->getOperand();
2959 SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
2960}
2961
2962LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
2963 const TargetLibraryInfo *TLI, AAResults *AA,
2964 DominatorTree *DT, LoopInfo *LI)
2965 : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
2966 PtrRtChecking(nullptr),
2967 DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
2968 PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
2969 if (canAnalyzeLoop()) {
2970 analyzeLoop(AA, LI, TLI, DT);
2971 }
2972}
2973
2974void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
2975 if (CanVecMem) {
2976 OS.indent(Depth) << "Memory dependences are safe";
2977 const MemoryDepChecker &DC = getDepChecker();
2978 if (!DC.isSafeForAnyVectorWidth())
2979 OS << " with a maximum safe vector width of "
2980 << DC.getMaxSafeVectorWidthInBits() << " bits";
2981 if (PtrRtChecking->Need)
2982 OS << " with run-time checks";
2983 OS << "\n";
2984 }
2985
2986 if (HasConvergentOp)
2987 OS.indent(Depth) << "Has convergent operation in loop\n";
2988
2989 if (Report)
2990 OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
2991
2992 if (auto *Dependences = DepChecker->getDependences()) {
2993 OS.indent(Depth) << "Dependences:\n";
2994 for (const auto &Dep : *Dependences) {
2995 Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
2996 OS << "\n";
2997 }
2998 } else
2999 OS.indent(Depth) << "Too many dependences, not recorded\n";
3000
3001 // List the pairs of accesses that need run-time checks to prove independence.
3002 PtrRtChecking->print(OS, Depth);
3003 OS << "\n";
3004
3005 OS.indent(Depth) << "Non vectorizable stores to invariant address were "
3006 << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
3007 << "found in loop.\n";
3008
3009 OS.indent(Depth) << "SCEV assumptions:\n";
3010 PSE->getPredicate().print(OS, Depth);
3011
3012 OS << "\n";
3013
3014 OS.indent(Depth) << "Expressions re-written:\n";
3015 PSE->print(OS, Depth);
3016}
3017
3018const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
3019 auto I = LoopAccessInfoMap.insert({&L, nullptr});
3020
3021 if (I.second)
3022 I.first->second =
3023 std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
3024
3025 return *I.first->second;
3026}
3027
3028bool LoopAccessInfoManager::invalidate(
3029 Function &F, const PreservedAnalyses &PA,
3030 FunctionAnalysisManager::Invalidator &Inv) {
3031 // Check whether our analysis is preserved.
3032 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3033 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3034 // If not, give up now.
3035 return true;
3036
3037 // Check whether the analyses we depend on became invalid for any reason.
3038 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3039 // invalid.
3040 return Inv.invalidate<AAManager>(F, PA) ||
3041 Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
3042 Inv.invalidate<LoopAnalysis>(F, PA) ||
3043 Inv.invalidate<DominatorTreeAnalysis>(F, PA);
3044}
3045
3046LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3047 FunctionAnalysisManager &FAM) {
3048 auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
3049 auto &AA = FAM.getResult<AAManager>(F);
3050 auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
3051 auto &LI = FAM.getResult<LoopAnalysis>(F);
3052 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
3053 return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
3054}
3055
3056AnalysisKey LoopAccessAnalysis::Key;
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
#define Check(C,...)
#define DEBUG_TYPE
Hexagon Common GEP
IRTranslator LLVM IR MI
static cl::opt< unsigned > MaxDependences("max-dependences", cl::Hidden, cl::desc("Maximum number of dependences collected by " "loop-access analysis (default = 100)"), cl::init(100))
We collect dependences up to this threshold.
static cl::opt< bool > EnableForwardingConflictDetection("store-to-load-forwarding-conflict-detection", cl::Hidden, cl::desc("Enable conflict detection in loop-access analysis"), cl::init(true))
Enable store-to-load forwarding conflict detection.
static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr, SmallVectorImpl< PointerIntPair< const SCEV *, 1, bool > > &ScevList, unsigned Depth)
static std::variant< MemoryDepChecker::Dependence::DepType, DepDistanceStrideAndSizeInfo > getDependenceDistanceStrideAndSize(const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, const AccessAnalysis::MemAccessInfo &B, Instruction *BInst, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects, PredicatedScalarEvolution &PSE, const Loop *InnermostLoop)
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr, const SCEV *PtrScev, Loop *L, bool Assume)
Check whether a pointer can participate in a runtime bounds check.
static cl::opt< unsigned > MemoryCheckMergeThreshold("memory-check-merge-threshold", cl::Hidden, cl::desc("Maximum number of comparisons done when trying to merge " "runtime memory checks. (default = 100)"), cl::init(100))
The maximum iterations used to merge memory checks.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &Strides, Value *Ptr, Type *AccessTy, Loop *L)
Check whether a pointer address cannot wrap.
static const SCEV * getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
Get the stride of a pointer access in a loop.
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep)
Find the operand of the GEP that should be checked for consecutive stores.
static cl::opt< unsigned, true > VectorizationInterleave("force-vector-interleave", cl::Hidden, cl::desc("Sets the vectorization interleave count. " "Zero is autoselect."), cl::location(VectorizerParams::VectorizationInterleave))
static bool isLoopVariantIndirectAddress(ArrayRef< const Value * > UnderlyingObjects, ScalarEvolution &SE, const Loop *L)
Returns true if any of the underlying objects has a loop varying address, i.e.
static Value * getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty)
If a value has only one user that is a CastInst, return it.
static cl::opt< bool, true > HoistRuntimeChecks("hoist-runtime-checks", cl::Hidden, cl::desc("Hoist inner loop runtime memory checks to outer loop if possible"), cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true))
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
static cl::opt< unsigned, true > RuntimeMemoryCheckThreshold("runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " "generate more than this number of comparisons (default = 8)."), cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8))
static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, function_ref< void(Value *)> AddPointer)
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, PredicatedScalarEvolution &PSE, const Loop *L)
Return true if an AddRec pointer Ptr is unsigned non-wrapping, i.e.
static Value * stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
If the argument is a GEP, then returns the operand identified by getGEPInductionOperand.
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV &BackedgeTakenCount, const SCEV &Dist, uint64_t Stride, uint64_t TypeByteSize)
Given a dependence-distance Dist between two memory accesses, that have the same stride whose absolut...
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, uint64_t TypeByteSize)
Check the dependence for two accesses with the same stride Stride.
static const SCEV * getMinFromExprs(const SCEV *I, const SCEV *J, ScalarEvolution *SE)
Compare I and J and return the minimum.
static cl::opt< unsigned > MaxForkedSCEVDepth("max-forked-scev-depth", cl::Hidden, cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"), cl::init(5))
static cl::opt< bool > SpeculateUnitStride("laa-speculate-unit-stride", cl::Hidden, cl::desc("Speculate that non-constant strides are unit in LAA"), cl::init(true))
static SmallVector< PointerIntPair< const SCEV *, 1, bool > > findForkedPointer(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, Value *Ptr, const Loop *L)
static cl::opt< bool > EnableMemAccessVersioning("enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symbolic stride memory access versioning"))
This enables versioning on the strides of symbolically striding memory accesses in code like the foll...
This header provides classes for managing per-loop analyses.
This file provides utility analysis objects describing memory locations.
This header defines various interfaces for pass management in LLVM.
This file defines the PointerIntPair class.
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
APInt abs() const
Get the absolute value.
Definition: APInt.h:1737
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
bool isStrictlyPositive() const
Determine if this APInt Value is positive.
Definition: APInt.h:334
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
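As an illustration of how the APInt operations listed above combine, here is a minimal sketch of normalizing a constant dependence distance; the helper name distanceMagnitude and its parameters are hypothetical, not part of the LLVM API.

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Hypothetical helper: extract direction and magnitude of a constant distance.
uint64_t distanceMagnitude(const APInt &Dist, bool &IsNegative) {
  APInt D = Dist.sextOrTrunc(64); // normalize the bit width first
  IsNegative = D.isNegative();    // sign gives the direction of the dependence
  return D.abs().getZExtValue();  // magnitude as a plain 64-bit integer
}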
This templated class represents "all analyses that operate over <a particular IR unit>" (e.g. a function or a loop).
Definition: Analysis.h:47
API to communicate dependencies between analyses during invalidation.
Definition: PassManager.h:360
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA)
Trigger the invalidation of some other analysis pass if not already handled and return whether it was...
Definition: PassManager.h:378
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
Definition: BasicBlock.cpp:289
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes in between queries.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient operations: insert an element into a class of its own, union two classes, and find the class for a given element.
iterator findValue(const ElemTy &V) const
findValue - Return an iterator to the specified value.
iterator insert(const ElemTy &Data)
insert - Insert a new value into the union/find set, ignoring the request if the value already exists...
member_iterator member_end() const
typename std::set< ECValue, ECValueComparator >::const_iterator iterator
iterator* - Provides a way to iterate over all values in the set.
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not already exist in the equivalence set.
const ElemTy & getLeaderValue(const ElemTy &V) const
getLeaderValue - Return the leader for the specified value that is in the set.
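To show how the EquivalenceClasses operations above compose (the dependence checker uses such a container for its DepCandidates sets), here is a minimal sketch; groupAccesses and its arguments are hypothetical names introduced only for illustration.

#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Hypothetical helper: put A and B into one class and walk that class.
void groupAccesses(EquivalenceClasses<Value *> &EC, Value *A, Value *B) {
  EC.insert(A);
  EC.insert(B);
  EC.unionSets(A, B);                   // A and B now share one class
  Value *Leader = EC.getLeaderValue(A); // the same leader is returned for B
  for (auto MI = EC.member_begin(EC.findValue(Leader)), ME = EC.member_end();
       MI != ME; ++MI) {
    // *MI visits every value in A's equivalence class.
  }
}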
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
Type * getResultElementType() const
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a module.
Definition: Instruction.cpp:82
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
static constexpr LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
This analysis provides dependence information for the memory accesses of a loop.
Result run(Function &F, FunctionAnalysisManager &AM)
bool invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv)
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
The memory dependence checker, which can determine the loop-independent and loop-carried dependences between memory accesses.
bool isInvariant(Value *V) const
Returns true if value V is loop invariant.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the information about the memory accesses in the loop.
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI)
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
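A minimal sketch of driving this analysis from a new-pass-manager function pass; MyPass is a hypothetical pass name, and only the analysis queries are the point of the example.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

struct MyPass : PassInfoMixin<MyPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    auto &LI = AM.getResult<LoopAnalysis>(F);
    auto &LAIs = AM.getResult<LoopAccessAnalysis>(F); // LoopAccessInfoManager
    for (Loop *L : LI.getLoopsInPreorder()) {
      if (!L->isInnermost())
        continue; // only innermost loops are analysed
      const LoopAccessInfo &LAI = LAIs.getInfo(*L);
      LAI.print(errs(), /*Depth=*/2); // dependences and run-time checks
    }
    return PreservedAnalyses::all();
  }
};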
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool isAnnotatedParallel() const
Returns true if the loop is annotated parallel.
Definition: LoopInfo.cpp:564
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
Metadata node.
Definition: Metadata.h:1067
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
Checks memory dependences among accesses to the same underlying object to determine whether vectorization is legal (and at which vectorization factor).
ArrayRef< unsigned > getOrderForAccess(Value *Ptr, bool IsWrite) const
Return the program order indices for the access location (Ptr, IsWrite).
bool isSafeForAnyVectorWidth() const
Return true if the number of elements that are safe to operate on simultaneously is not bounded.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects)
Check whether the dependencies between the accesses are safe.
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const Loop * getInnermostLoop() const
uint64_t getMaxSafeVectorWidthInBits() const
Return the number of elements that are safe to operate on simultaneously, multiplied by the size of the element in bits.
bool isSafeForVectorization() const
No memory dependence was encountered that would inhibit vectorization.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Find the set of instructions that read or write via Ptr.
VectorizationSafetyStatus
Type to keep track of the status of the dependence check.
bool shouldRetryWithRuntimeCheck() const
In some cases when the dependency check fails we can still vectorize the loop with a dynamic array access check.
void addAccess(StoreInst *SI)
Register the location (instructions are given increasing numbers) of a write access.
PointerIntPair< Value *, 1, bool > MemAccessInfo
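A minimal sketch of querying the dependence checker attached to an already analysed LoopAccessInfo; reportDependenceSafety is a hypothetical helper introduced only for illustration.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper: summarize what the dependence checker concluded.
void reportDependenceSafety(const LoopAccessInfo &LAI) {
  const MemoryDepChecker &DepChecker = LAI.getDepChecker();
  if (DepChecker.isSafeForVectorization()) {
    if (!DepChecker.isSafeForAnyVectorWidth())
      errs() << "max safe width (bits): "
             << DepChecker.getMaxSafeVectorWidthInBits() << "\n";
  } else if (const auto *Deps = DepChecker.getDependences()) {
    // Dependences are only recorded up to the -max-dependences threshold.
    errs() << "unsafe; recorded dependences: " << Deps->size() << "\n";
  }
}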
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
AAMDNodes AATags
The metadata nodes that describe the aliasing of the location (each member is null if that kind of metadata is unavailable).
const Value * Ptr
The address of the start of the location.
Root of the metadata hierarchy.
Definition: Metadata.h:62
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of existing predicates.
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
bool hasNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Returns true if we've proved that V doesn't wrap by means of a SCEV predicate.
void setNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Proves that V doesn't overflow by adding SCEV predicate.
void print(raw_ostream &OS, unsigned Depth) const
Print the SCEV mappings done by the Predicated Scalar Evolution.
const SCEVAddRecExpr * getAsAddRec(Value *V)
Attempts to produce an AddRecExpr for V by adding additional SCEV predicates.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
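A minimal sketch of the typical PredicatedScalarEvolution usage pattern: first try the SCEV under the current predicates, then ask getAsAddRec to add predicates if needed. getPointerAddRec is a hypothetical helper name.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Hypothetical helper: obtain an add-recurrence for a pointer value.
const SCEVAddRecExpr *getPointerAddRec(PredicatedScalarEvolution &PSE,
                                       Value *Ptr) {
  const SCEV *PtrScev = PSE.getSCEV(Ptr); // SCEV under the current predicates
  if (const auto *AR = dyn_cast<SCEVAddRecExpr>(PtrScev))
    return AR;
  // May add run-time SCEV predicates (see getPredicate()/addPredicate()).
  return PSE.getAsAddRec(Ptr);
}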
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
PreservedAnalysisChecker getChecker() const
Build a checker for this PreservedAnalyses and the specified analysis type.
Definition: Analysis.h:264
Holds information about the memory runtime legality checks to verify that a group of pointers do not overlap.
bool Need
This flag indicates if we need to add the runtime check.
void reset()
Reset the state of the pointer runtime information.
unsigned getNumberOfChecks() const
Returns the number of run-time checks required according to needsChecking.
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the list of run-time memory checks necessary.
SmallVector< RuntimeCheckingPtrGroup, 2 > CheckingGroups
Holds a partitioning of pointers into "check groups".
void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies)
Generate the checks and store it.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze)
Insert a pointer and calculate the start and end SCEVs.
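A minimal sketch of inspecting the run-time checks recorded by an analysed LoopAccessInfo; reportRuntimeChecks is a hypothetical helper introduced only for illustration.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper: report how many run-time checks the loop would need.
void reportRuntimeChecks(const LoopAccessInfo &LAI) {
  const RuntimePointerChecking *RtChecking = LAI.getRuntimePointerChecking();
  if (!RtChecking->Need)
    return; // no memory checks are required for this loop
  errs() << RtChecking->getNumberOfChecks() << " check(s) over "
         << RtChecking->CheckingGroups.size() << " pointer group(s)\n";
  RtChecking->print(errs(), /*Depth=*/2); // full textual dump of the checks
}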
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const
Return the value of this chain of recurrences at the specified iteration number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
virtual void print(raw_ostream &OS, unsigned Depth=0) const =0
Prints a textual representation of this predicate with an indentation of Depth.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM Value.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getUMaxExpr(const SCEV *LHS, const SCEV *RHS)
const SCEVPredicate * getEqualPredicate(const SCEV *LHS, const SCEV *RHS)
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
const SCEV * getPtrToIntExpr(const SCEV *Op, Type *Ty)
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the given type, for which isSCEVable must return true.
const SCEV * getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential=false)
const SCEV * getStoreSizeOfExpr(Type *IntTy, Type *StoreTy)
Return an expression for the store size of StoreTy that is type IntTy.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getSizeOfExpr(Type *IntTy, TypeSize Size)
Return an expression for a TypeSize.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
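A minimal sketch combining a few of the ScalarEvolution queries above to form a symbolic distance between two pointers; the helper name and its parameters are hypothetical.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Hypothetical helper: form B - A and ask SCEV a few questions about it.
bool distanceIsLoopInvariantAndPositive(ScalarEvolution &SE, Value *PtrA,
                                        Value *PtrB, const Loop *L) {
  const SCEV *A = SE.getSCEV(PtrA);
  const SCEV *B = SE.getSCEV(PtrB);
  const SCEV *Dist = SE.getMinusSCEV(B, A); // B - A
  if (isa<SCEVCouldNotCompute>(Dist))
    return false; // e.g. the pointers have different bases
  return SE.isLoopInvariant(Dist, L) && SE.isKnownPositive(Dist);
}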
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:561
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1053
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool NullPointerIsDefined(const Function *F, unsigned AS=0)
Check whether null pointer dereferencing is considered undefined behavior for a given function or an address space.
Definition: Function.cpp:2036
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
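A minimal sketch of a getPtrStride query for a load; isUnitStrideLoad is a hypothetical helper, and PSE and TheLoop are assumed to be supplied by the caller.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// Hypothetical helper: is this load a consecutive, forward (stride 1) access?
bool isUnitStrideLoad(LoadInst *LI, PredicatedScalarEvolution &PSE,
                      const Loop *TheLoop) {
  std::optional<int64_t> Stride =
      getPtrStride(PSE, LI->getType(), LI->getPointerOperand(), TheLoop);
  return Stride && *Stride == 1;
}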
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions and return multiple objects.
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one, assuming the SCEV predicate associated with PSE is true.
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
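A minimal sketch using getPointersDiff with the loads' element types; loadsAreAdjacent is a hypothetical helper (isConsecutiveAccess performs a similar check with strict offset matching).

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// Hypothetical helper: does B load the element immediately after A?
bool loadsAreAdjacent(LoadInst *A, LoadInst *B, const DataLayout &DL,
                      ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(),
                      B->getType(), B->getPointerOperand(), DL, SE);
  return Diff && *Diff == 1; // exactly one element apart, A then B
}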
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:783
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:777
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:786
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
Dependence between memory access instructions.
DepType Type
The type of the dependence.
bool isPossiblyBackward() const
May be a lexically backward dependence type (includes Unknown).
bool isForward() const
Lexically forward dependence.
bool isBackward() const
Lexically backward dependence.
void print(raw_ostream &OS, unsigned Depth, const SmallVectorImpl< Instruction * > &Instrs) const
Print the dependence.
Instruction * getDestination(const LoopAccessInfo &LAI) const
Return the destination instruction of the dependence.
Instruction * getSource(const LoopAccessInfo &LAI) const
Return the source instruction of the dependence.
DepType
The type of the dependence.
static const char * DepName[]
String version of the types.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
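A minimal sketch of walking the recorded dependences and printing the backward ones; listBackwardDependences is a hypothetical helper introduced only for illustration.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper: print source and destination of backward dependences.
void listBackwardDependences(const LoopAccessInfo &LAI) {
  const auto *Deps = LAI.getDepChecker().getDependences();
  if (!Deps)
    return; // recording stopped after too many dependences
  for (const MemoryDepChecker::Dependence &D : *Deps)
    if (D.isBackward())
      errs() << "backward: " << *D.getSource(LAI) << " -> "
             << *D.getDestination(LAI) << "\n";
}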
unsigned AddressSpace
Address space of the involved pointers.
bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck)
Tries to add the pointer recorded in RtCheck at index Index to this pointer checking group.
bool NeedsFreeze
Whether the pointer needs to be frozen after expansion, e.g.
const SCEV * High
The SCEV expression which represents the upper bound of all the pointers in this group.
SmallVector< unsigned, 2 > Members
Indices of all the pointers that constitute this grouping.
RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
Create a new pointer checking group containing a single pointer, with index Index in RtCheck.
const SCEV * Low
The SCEV expression which represents the lower bound of all the pointers in this group.
bool IsWritePtr
Holds whether this pointer is used for writing to memory.
unsigned DependencySetId
Holds the id of the set of pointers that could be dependent because of a shared underlying object.
unsigned AliasSetId
Holds the id of the disjoint alias set to which this pointer belongs.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static unsigned VectorizationFactor
VF as overridden by the user.
static unsigned RuntimeMemoryCheckThreshold
When performing memory disambiguation checks at runtime, do not make more than this number of comparisons.
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450