1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation for the loop memory dependence analysis that was
10// originally developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallSet.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
45#include "llvm/IR/InstrTypes.h"
46#include "llvm/IR/Instruction.h"
48#include "llvm/IR/Operator.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Type.h"
52#include "llvm/IR/Value.h"
53#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Debug.h"
59#include <algorithm>
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64#include <variant>
65#include <vector>
66
67using namespace llvm;
68using namespace llvm::PatternMatch;
69
70#define DEBUG_TYPE "loop-accesses"
71
72static cl::opt<unsigned, true>
73VectorizationFactor("force-vector-width", cl::Hidden,
74 cl::desc("Sets the SIMD width. Zero is autoselect."),
75 cl::location(VectorizerParams::VectorizationFactor));
76unsigned VectorizerParams::VectorizationFactor;
77
78static cl::opt<unsigned, true>
79VectorizationInterleave("force-vector-interleave", cl::Hidden,
80 cl::desc("Sets the vectorization interleave count. "
81 "Zero is autoselect."),
82 cl::location(
83 VectorizerParams::VectorizationInterleave));
84unsigned VectorizerParams::VectorizationInterleave;
85
87 "runtime-memory-check-threshold", cl::Hidden,
88 cl::desc("When performing memory disambiguation checks at runtime do not "
89 "generate more than this number of comparisons (default = 8)."),
92
93/// The maximum iterations used to merge memory checks
95 "memory-check-merge-threshold", cl::Hidden,
96 cl::desc("Maximum number of comparisons done when trying to merge "
97 "runtime memory checks. (default = 100)"),
98 cl::init(100));
99
100/// Maximum SIMD width.
101const unsigned VectorizerParams::MaxVectorWidth = 64;
102
103/// We collect dependences up to this threshold.
104static cl::opt<unsigned>
105 MaxDependences("max-dependences", cl::Hidden,
106 cl::desc("Maximum number of dependences collected by "
107 "loop-access analysis (default = 100)"),
108 cl::init(100));
109
110/// This enables versioning on the strides of symbolically striding memory
111/// accesses in code like the following.
112/// for (i = 0; i < N; ++i)
113/// A[i * Stride1] += B[i * Stride2] ...
114///
115/// Will be roughly translated to
116/// if (Stride1 == 1 && Stride2 == 1) {
117/// for (i = 0; i < N; i+=4)
118/// A[i:i+3] += ...
119/// } else
120/// ...
122 "enable-mem-access-versioning", cl::init(true), cl::Hidden,
123 cl::desc("Enable symbolic stride memory access versioning"));
124
125/// Enable store-to-load forwarding conflict detection. This option can
126/// be disabled for correctness testing.
128 "store-to-load-forwarding-conflict-detection", cl::Hidden,
129 cl::desc("Enable conflict detection in loop-access analysis"),
130 cl::init(true));
131
133 "max-forked-scev-depth", cl::Hidden,
134 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
135 cl::init(5));
136
138 "laa-speculate-unit-stride", cl::Hidden,
139 cl::desc("Speculate that non-constant strides are unit in LAA"),
140 cl::init(true));
141
143 "hoist-runtime-checks", cl::Hidden,
144 cl::desc(
145 "Hoist inner loop runtime memory checks to outer loop if possible"),
148
149bool VectorizerParams::isInterleaveForced() {
150 return ::VectorizationInterleave.getNumOccurrences() > 0;
151}
152
153const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
154 const DenseMap<Value *, const SCEV *> &PtrToStride,
155 Value *Ptr) {
156 const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
157
158 // If there is an entry in the map return the SCEV of the pointer with the
159 // symbolic stride replaced by one.
160 DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
161 if (SI == PtrToStride.end())
162 // For a non-symbolic stride, just return the original expression.
163 return OrigSCEV;
164
165 const SCEV *StrideSCEV = SI->second;
166 // Note: This assert is both overly strong and overly weak. The actual
167 // invariant here is that StrideSCEV should be loop invariant. The only
168 // such invariant strides we happen to speculate right now are unknowns
169 // and thus this is a reasonable proxy of the actual invariant.
170 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
171
172 ScalarEvolution *SE = PSE.getSE();
173 const auto *CT = SE->getOne(StrideSCEV->getType());
174 PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
175 auto *Expr = PSE.getSCEV(Ptr);
176
177 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
178 << " by: " << *Expr << "\n");
179 return Expr;
180}
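// Illustration (hypothetical caller, not part of this file): for a loop such
// as
//
//   for (i = 0; i < N; ++i)
//     A[i * Stride] += 1;
//
// where Stride is loop-invariant but unknown at compile time, PtrToStride maps
// the address of A[i * Stride] to the SCEV of Stride. replaceSymbolicStrideSCEV
// then records the predicate "Stride == 1" in PSE and returns the pointer's
// SCEV with the stride speculated to one (an affine AddRec stepping by the
// element size).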
181
182RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
183 unsigned Index, RuntimePointerChecking &RtCheck)
184 : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
185 AddressSpace(RtCheck.Pointers[Index]
186 .PointerValue->getType()
187 ->getPointerAddressSpace()),
188 NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
189 Members.push_back(Index);
190}
191
192/// Calculate Start and End points of memory access.
193/// Let's assume A is the first access and B is a memory access on the N-th
194/// loop iteration. Then B is calculated as:
195/// B = A + Step*N .
196/// Step value may be positive or negative.
197/// N is a calculated back-edge taken count:
198/// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
199/// Start and End points are calculated in the following way:
200/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
201/// where SizeOfElt is the size of single memory access in bytes.
202///
203/// There is no conflict when the intervals are disjoint:
204/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
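///
/// For example (illustrative numbers): with A = %base, Step = 4, a back-edge
/// taken count N = 7 and a 4-byte element, B = %base + 28, so
/// Start = UMIN(A, B) = %base and End = UMAX(A, B) + 4 = %base + 32.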
205void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
206 Type *AccessTy, bool WritePtr,
207 unsigned DepSetId, unsigned ASId,
208 PredicatedScalarEvolution &PSE,
209 bool NeedsFreeze) {
210 ScalarEvolution *SE = PSE.getSE();
211
212 const SCEV *ScStart;
213 const SCEV *ScEnd;
214
215 if (SE->isLoopInvariant(PtrExpr, Lp)) {
216 ScStart = ScEnd = PtrExpr;
217 } else {
218 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
219 assert(AR && "Invalid addrec expression");
220 const SCEV *Ex = PSE.getBackedgeTakenCount();
221
222 ScStart = AR->getStart();
223 ScEnd = AR->evaluateAtIteration(Ex, *SE);
224 const SCEV *Step = AR->getStepRecurrence(*SE);
225
226 // For expressions with negative step, the upper bound is ScStart and the
227 // lower bound is ScEnd.
228 if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
229 if (CStep->getValue()->isNegative())
230 std::swap(ScStart, ScEnd);
231 } else {
232 // Fallback case: the step is not constant, but we can still
233 // get the upper and lower bounds of the interval by using min/max
234 // expressions.
235 ScStart = SE->getUMinExpr(ScStart, ScEnd);
236 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
237 }
238 }
239 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
240 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
241
242 // Add the size of the pointed element to ScEnd.
243 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
244 Type *IdxTy = DL.getIndexType(Ptr->getType());
245 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
246 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
247
248 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
249 NeedsFreeze);
250}
251
252void RuntimePointerChecking::tryToCreateDiffCheck(
253 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
254 if (!CanUseDiffCheck)
255 return;
256
257 // If either group contains multiple different pointers, bail out.
258 // TODO: Support multiple pointers by using the minimum or maximum pointer,
259 // depending on src & sink.
260 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
261 CanUseDiffCheck = false;
262 return;
263 }
264
265 PointerInfo *Src = &Pointers[CGI.Members[0]];
266 PointerInfo *Sink = &Pointers[CGJ.Members[0]];
267
268 // If either pointer is read and written, multiple checks may be needed. Bail
269 // out.
270 if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
271 !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
272 CanUseDiffCheck = false;
273 return;
274 }
275
276 ArrayRef<unsigned> AccSrc =
277 DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
278 ArrayRef<unsigned> AccSink =
279 DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
280 // If either pointer is accessed multiple times, there may not be a clear
281 // src/sink relation. Bail out for now.
282 if (AccSrc.size() != 1 || AccSink.size() != 1) {
283 CanUseDiffCheck = false;
284 return;
285 }
286 // If the sink is accessed before src, swap src/sink.
287 if (AccSink[0] < AccSrc[0])
288 std::swap(Src, Sink);
289
290 auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
291 auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
292 if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
293 SinkAR->getLoop() != DC.getInnermostLoop()) {
294 CanUseDiffCheck = false;
295 return;
296 }
297
298 SmallVector<Instruction *, 4> SrcInsts =
299 DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
300 SmallVector<Instruction *, 4> SinkInsts =
301 DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
302 Type *SrcTy = getLoadStoreType(SrcInsts[0]);
303 Type *DstTy = getLoadStoreType(SinkInsts[0]);
304 if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
305 CanUseDiffCheck = false;
306 return;
307 }
308 const DataLayout &DL =
309 SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
310 unsigned AllocSize =
311 std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
312
313 // Only matching constant steps matching the AllocSize are supported at the
314 // moment. This simplifies the difference computation. Can be extended in the
315 // future.
316 auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
317 if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
318 Step->getAPInt().abs() != AllocSize) {
319 CanUseDiffCheck = false;
320 return;
321 }
322
323 IntegerType *IntTy =
324 IntegerType::get(Src->PointerValue->getContext(),
325 DL.getPointerSizeInBits(CGI.AddressSpace));
326
327 // When counting down, the dependence distance needs to be swapped.
328 if (Step->getValue()->isNegative())
329 std::swap(SinkAR, SrcAR);
330
331 const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
332 const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
333 if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
334 isa<SCEVCouldNotCompute>(SrcStartInt)) {
335 CanUseDiffCheck = false;
336 return;
337 }
338
339 const Loop *InnerLoop = SrcAR->getLoop();
340 // If the start values for both Src and Sink also vary according to an outer
341 // loop, then it's probably better to avoid creating diff checks because
342 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
343 // do the expanded full range overlap checks, which can be hoisted.
344 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
345 isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
346 auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
347 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
348 const Loop *StartARLoop = SrcStartAR->getLoop();
349 if (StartARLoop == SinkStartAR->getLoop() &&
350 StartARLoop == InnerLoop->getParentLoop() &&
351 // If the diff check would already be loop invariant (due to the
352 // recurrences being the same), then we prefer to keep the diff checks
353 // because they are cheaper.
354 SrcStartAR->getStepRecurrence(*SE) !=
355 SinkStartAR->getStepRecurrence(*SE)) {
356 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
357 "cannot be hoisted out of the outer loop\n");
358 CanUseDiffCheck = false;
359 return;
360 }
361 }
362
363 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
364 << "SrcStart: " << *SrcStartInt << '\n'
365 << "SinkStartInt: " << *SinkStartInt << '\n');
366 DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
367 Src->NeedsFreeze || Sink->NeedsFreeze);
368}
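// Illustrative case (hypothetical accesses, not from this file): for a source
// load of A[i] and a sink store to B[i], both i32 recurrences with step 4 ==
// AllocSize in the same innermost loop, a single DiffChecks entry (SrcStart,
// SinkStart, AllocSize, NeedsFreeze) is recorded, so one start-address
// difference test can stand in for the pair of full interval-overlap checks.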
369
370SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
371 SmallVector<RuntimePointerCheck, 4> Checks;
372
373 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
374 for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
375 const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
376 const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
377
378 if (needsChecking(CGI, CGJ)) {
379 tryToCreateDiffCheck(CGI, CGJ);
380 Checks.push_back(std::make_pair(&CGI, &CGJ));
381 }
382 }
383 }
384 return Checks;
385}
386
387void RuntimePointerChecking::generateChecks(
388 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
389 assert(Checks.empty() && "Checks is not empty");
390 groupChecks(DepCands, UseDependencies);
391 Checks = generateChecks();
392}
393
394bool RuntimePointerChecking::needsChecking(
395 const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
396 for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
397 for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
398 if (needsChecking(M.Members[I], N.Members[J]))
399 return true;
400 return false;
401}
402
403/// Compare \p I and \p J and return the minimum.
404/// Return nullptr in case we couldn't find an answer.
405static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
406 ScalarEvolution *SE) {
407 const SCEV *Diff = SE->getMinusSCEV(J, I);
408 const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
409
410 if (!C)
411 return nullptr;
412 if (C->getValue()->isNegative())
413 return J;
414 return I;
415}
416
417bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
418 RuntimePointerChecking &RtCheck) {
419 return addPointer(
420 Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
421 RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
422 RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
423}
424
425bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
426 const SCEV *End, unsigned AS,
427 bool NeedsFreeze,
428 ScalarEvolution &SE) {
429 assert(AddressSpace == AS &&
430 "all pointers in a checking group must be in the same address space");
431
432 // Compare the starts and ends with the known minimum and maximum
433 // of this set. We need to know how we compare against the min/max
434 // of the set in order to be able to emit memchecks.
435 const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
436 if (!Min0)
437 return false;
438
439 const SCEV *Min1 = getMinFromExprs(End, High, &SE);
440 if (!Min1)
441 return false;
442
443 // Update the low bound expression if we've found a new min value.
444 if (Min0 == Start)
445 Low = Start;
446
447 // Update the high bound expression if we've found a new max value.
448 if (Min1 != End)
449 High = End;
450
451 Members.push_back(Index);
452 this->NeedsFreeze |= NeedsFreeze;
453 return true;
454}
455
456void RuntimePointerChecking::groupChecks(
457 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
458 // We build the groups from dependency candidates equivalence classes
459 // because:
460 // - We know that pointers in the same equivalence class share
461 // the same underlying object and therefore there is a chance
462 // that we can compare pointers
463 // - We wouldn't be able to merge two pointers for which we need
464 // to emit a memcheck. The classes in DepCands are already
465 // conveniently built such that no two pointers in the same
466 // class need checking against each other.
467
468 // We use the following (greedy) algorithm to construct the groups
469 // For every pointer in the equivalence class:
470 // For each existing group:
471 // - if the difference between this pointer and the min/max bounds
472 // of the group is a constant, then make the pointer part of the
473 // group and update the min/max bounds of that group as required.
474
475 CheckingGroups.clear();
476
477 // If we need to check two pointers to the same underlying object
478 // with a non-constant difference, we shouldn't perform any pointer
479 // grouping with those pointers. This is because we can easily get
480 // into cases where the resulting check would return false, even when
481 // the accesses are safe.
482 //
483 // The following example shows this:
484 // for (i = 0; i < 1000; ++i)
485 // a[5000 + i * m] = a[i] + a[i + 9000]
486 //
487 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
488 // (0, 10000) which is always false. However, if m is 1, there is no
489 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
490 // us to perform an accurate check in this case.
491 //
492 // The above case requires that we have an UnknownDependence between
493 // accesses to the same underlying object. This cannot happen unless
494 // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
495 // is also false. In this case we will use the fallback path and create
496 // separate checking groups for all pointers.
497
498 // If we don't have the dependency partitions, construct a new
499 // checking pointer group for each pointer. This is also required
500 // for correctness, because in this case we can have checking between
501 // pointers to the same underlying object.
502 if (!UseDependencies) {
503 for (unsigned I = 0; I < Pointers.size(); ++I)
504 CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
505 return;
506 }
507
508 unsigned TotalComparisons = 0;
509
510 DenseMap<Value *, SmallVector<unsigned>> PositionMap;
511 for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
512 auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
513 Iter.first->second.push_back(Index);
514 }
515
516 // We need to keep track of what pointers we've already seen so we
517 // don't process them twice.
518 SmallSet<unsigned, 2> Seen;
519
520 // Go through all equivalence classes, get the "pointer check groups"
521 // and add them to the overall solution. We use the order in which accesses
522 // appear in 'Pointers' to enforce determinism.
523 for (unsigned I = 0; I < Pointers.size(); ++I) {
524 // We've seen this pointer before, and therefore already processed
525 // its equivalence class.
526 if (Seen.count(I))
527 continue;
528
529 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
530 Pointers[I].IsWritePtr);
531
532 SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
533 auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
534
535 // Because DepCands is constructed by visiting accesses in the order in
536 // which they appear in alias sets (which is deterministic) and the
537 // iteration order within an equivalence class member is only dependent on
538 // the order in which unions and insertions are performed on the
539 // equivalence class, the iteration order is deterministic.
540 for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
541 MI != ME; ++MI) {
542 auto PointerI = PositionMap.find(MI->getPointer());
543 assert(PointerI != PositionMap.end() &&
544 "pointer in equivalence class not found in PositionMap");
545 for (unsigned Pointer : PointerI->second) {
546 bool Merged = false;
547 // Mark this pointer as seen.
548 Seen.insert(Pointer);
549
550 // Go through all the existing sets and see if we can find one
551 // which can include this pointer.
552 for (RuntimeCheckingPtrGroup &Group : Groups) {
553 // Don't perform more than a certain amount of comparisons.
554 // This should limit the cost of grouping the pointers to something
555 // reasonable. If we do end up hitting this threshold, the algorithm
556 // will create separate groups for all remaining pointers.
557 if (TotalComparisons > MemoryCheckMergeThreshold)
558 break;
559
560 TotalComparisons++;
561
562 if (Group.addPointer(Pointer, *this)) {
563 Merged = true;
564 break;
565 }
566 }
567
568 if (!Merged)
569 // We couldn't add this pointer to any existing set or the threshold
570 // for the number of comparisons has been reached. Create a new group
571 // to hold the current pointer.
572 Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
573 }
574 }
575
576 // We've computed the grouped checks for this partition.
577 // Save the results and continue with the next one.
578 llvm::copy(Groups, std::back_inserter(CheckingGroups));
579 }
580}
581
582bool RuntimePointerChecking::arePointersInSamePartition(
583 const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
584 unsigned PtrIdx2) {
585 return (PtrToPartition[PtrIdx1] != -1 &&
586 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
587}
588
589bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
590 const PointerInfo &PointerI = Pointers[I];
591 const PointerInfo &PointerJ = Pointers[J];
592
593 // No need to check if two readonly pointers intersect.
594 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
595 return false;
596
597 // Only need to check pointers between two different dependency sets.
598 if (PointerI.DependencySetId == PointerJ.DependencySetId)
599 return false;
600
601 // Only need to check pointers in the same alias set.
602 if (PointerI.AliasSetId != PointerJ.AliasSetId)
603 return false;
604
605 return true;
606}
607
608void RuntimePointerChecking::printChecks(
609 raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
610 unsigned Depth) const {
611 unsigned N = 0;
612 for (const auto &Check : Checks) {
613 const auto &First = Check.first->Members, &Second = Check.second->Members;
614
615 OS.indent(Depth) << "Check " << N++ << ":\n";
616
617 OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
618 for (unsigned K = 0; K < First.size(); ++K)
619 OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
620
621 OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
622 for (unsigned K = 0; K < Second.size(); ++K)
623 OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
624 }
625}
626
627void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
628
629 OS.indent(Depth) << "Run-time memory checks:\n";
630 printChecks(OS, Checks, Depth);
631
632 OS.indent(Depth) << "Grouped accesses:\n";
633 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
634 const auto &CG = CheckingGroups[I];
635
636 OS.indent(Depth + 2) << "Group " << &CG << ":\n";
637 OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
638 << ")\n";
639 for (unsigned J = 0; J < CG.Members.size(); ++J) {
640 OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
641 << "\n";
642 }
643 }
644}
645
646namespace {
647
648/// Analyses memory accesses in a loop.
649///
650/// Checks whether run time pointer checks are needed and builds sets for data
651/// dependence checking.
652class AccessAnalysis {
653public:
654 /// Read or write access location.
655 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
656 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
657
658 AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
659 MemoryDepChecker::DepCandidates &DA,
660 PredicatedScalarEvolution &PSE,
661 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
662 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
663 LoopAliasScopes(LoopAliasScopes) {
664 // We're analyzing dependences across loop iterations.
665 BAA.enableCrossIterationMode();
666 }
667
668 /// Register a load and whether it is only read from.
669 void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
670 Value *Ptr = const_cast<Value *>(Loc.Ptr);
671 AST.add(adjustLoc(Loc));
672 Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
673 if (IsReadOnly)
674 ReadOnlyPtr.insert(Ptr);
675 }
676
677 /// Register a store.
678 void addStore(MemoryLocation &Loc, Type *AccessTy) {
679 Value *Ptr = const_cast<Value *>(Loc.Ptr);
680 AST.add(adjustLoc(Loc));
681 Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
682 }
683
684 /// Check if we can emit a run-time no-alias check for \p Access.
685 ///
686 /// Returns true if we can emit a run-time no alias check for \p Access.
687 /// If we can check this access, this also adds it to a dependence set and
688 /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
689 /// we will attempt to use additional run-time checks in order to get
690 /// the bounds of the pointer.
691 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
692 MemAccessInfo Access, Type *AccessTy,
693 const DenseMap<Value *, const SCEV *> &Strides,
694 DenseMap<MemAccessInfo, unsigned> &DepSetId,
695 Loop *TheLoop, unsigned &RunningDepId,
696 unsigned ASId, bool ShouldCheckStride, bool Assume);
697
698 /// Check whether we can check the pointers at runtime for
699 /// non-intersection.
700 ///
701 /// Returns true if we need no check or if we do and we can generate them
702 /// (i.e. the pointers have computable bounds).
703 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
704 Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides,
705 Value *&UncomputablePtr, bool ShouldCheckWrap = false);
706
707 /// Goes over all memory accesses, checks whether a RT check is needed
708 /// and builds sets of dependent accesses.
709 void buildDependenceSets() {
710 processMemAccesses();
711 }
712
713 /// Initial processing of memory accesses determined that we need to
714 /// perform dependency checking.
715 ///
716 /// Note that this can later be cleared if we retry memcheck analysis without
717 /// dependency checking (i.e. FoundNonConstantDistanceDependence).
718 bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
719
720 /// We decided that no dependence analysis would be used. Reset the state.
721 void resetDepChecks(MemoryDepChecker &DepChecker) {
722 CheckDeps.clear();
723 DepChecker.clearDependences();
724 }
725
726 MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
727
728 const DenseMap<Value *, SmallVector<const Value *, 16>> &
729 getUnderlyingObjects() {
730 return UnderlyingObjects;
731 }
732
733private:
734 typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
735
736 /// Adjust the MemoryLocation so that it represents accesses to this
737 /// location across all iterations, rather than a single one.
738 MemoryLocation adjustLoc(MemoryLocation Loc) const {
739 // The accessed location varies within the loop, but remains within the
740 // underlying object.
741 Loc.AATags.TBAA = nullptr;
742 Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
743 Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
744 return Loc;
745 }
746
747 /// Drop alias scopes that are only valid within a single loop iteration.
748 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
749 if (!ScopeList)
750 return nullptr;
751
752 // For the sake of simplicity, drop the whole scope list if any scope is
753 // iteration-local.
754 if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
755 return LoopAliasScopes.contains(cast<MDNode>(Scope));
756 }))
757 return nullptr;
758
759 return ScopeList;
760 }
761
762 /// Go over all memory access and check whether runtime pointer checks
763 /// are needed and build sets of dependency check candidates.
764 void processMemAccesses();
765
766 /// Map of all accesses. Values are the types used to access memory pointed to
767 /// by the pointer.
768 PtrAccessMap Accesses;
769
770 /// The loop being checked.
771 const Loop *TheLoop;
772
773 /// List of accesses that need a further dependence check.
774 MemAccessInfoList CheckDeps;
775
776 /// Set of pointers that are read only.
777 SmallPtrSet<Value*, 16> ReadOnlyPtr;
778
779 /// Batched alias analysis results.
780 BatchAAResults BAA;
781
782 /// An alias set tracker to partition the access set by underlying object and
783 /// intrinsic property (such as TBAA metadata).
784 AliasSetTracker AST;
785
786 LoopInfo *LI;
787
788 /// Sets of potentially dependent accesses - members of one set share an
789 /// underlying pointer. The set "CheckDeps" identifies which sets really need a
790 /// dependence check.
791 MemoryDepChecker::DepCandidates &DepCands;
792
793 /// Initial processing of memory accesses determined that we may need
794 /// to add memchecks. Perform the analysis to determine the necessary checks.
795 ///
796 /// Note that this is different from isDependencyCheckNeeded. When we retry
797 /// memcheck analysis without dependency checking
798 /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
799 /// cleared while this remains set if we have potentially dependent accesses.
800 bool IsRTCheckAnalysisNeeded = false;
801
802 /// The SCEV predicate containing all the SCEV-related assumptions.
803 PredicatedScalarEvolution &PSE;
804
805 DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;
806
807 /// Alias scopes that are declared inside the loop, and as such not valid
808 /// across iterations.
809 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
810};
811
812} // end anonymous namespace
813
814/// Check whether a pointer can participate in a runtime bounds check.
815/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
816/// by adding run-time checks (overflow checks) if necessary.
817static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
818 const SCEV *PtrScev, Loop *L, bool Assume) {
819 // The bounds for a loop-invariant pointer are trivial.
820 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
821 return true;
822
823 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
824
825 if (!AR && Assume)
826 AR = PSE.getAsAddRec(Ptr);
827
828 if (!AR)
829 return false;
830
831 return AR->isAffine();
832}
833
834/// Check whether a pointer address cannot wrap.
835static bool isNoWrap(PredicatedScalarEvolution &PSE,
836 const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr, Type *AccessTy,
837 Loop *L) {
838 const SCEV *PtrScev = PSE.getSCEV(Ptr);
839 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
840 return true;
841
842 int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
843 if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
844 return true;
845
846 return false;
847}
848
849static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
850 function_ref<void(Value *)> AddPointer) {
851 SmallPtrSet<Value *, 8> Visited;
852 SmallVector<Value *> WorkList;
853 WorkList.push_back(StartPtr);
854
855 while (!WorkList.empty()) {
856 Value *Ptr = WorkList.pop_back_val();
857 if (!Visited.insert(Ptr).second)
858 continue;
859 auto *PN = dyn_cast<PHINode>(Ptr);
860 // SCEV does not look through non-header PHIs inside the loop. Such phis
861 // can be analyzed by adding separate accesses for each incoming pointer
862 // value.
863 if (PN && InnermostLoop.contains(PN->getParent()) &&
864 PN->getParent() != InnermostLoop.getHeader()) {
865 for (const Use &Inc : PN->incoming_values())
866 WorkList.push_back(Inc);
867 } else
868 AddPointer(Ptr);
869 }
870}
871
872// Walk back through the IR for a pointer, looking for a select like the
873// following:
874//
875// %offset = select i1 %cmp, i64 %a, i64 %b
876// %addr = getelementptr double, double* %base, i64 %offset
877// %ld = load double, double* %addr, align 8
878//
879// We won't be able to form a single SCEVAddRecExpr from this since the
880// address for each loop iteration depends on %cmp. We could potentially
881// produce multiple valid SCEVAddRecExprs, though, and check all of them for
882// memory safety/aliasing if needed.
883//
884// If we encounter some IR we don't yet handle, or something obviously fine
885// like a constant, then we just add the SCEV for that term to the list passed
886// in by the caller. If we have a node that may potentially yield a valid
887// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
888// ourselves before adding to the list.
889static void findForkedSCEVs(
890 ScalarEvolution *SE, const Loop *L, Value *Ptr,
891 SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
892 unsigned Depth) {
893 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
894 // we've exceeded our limit on recursion, just return whatever we have
895 // regardless of whether it can be used for a forked pointer or not, along
896 // with an indication of whether it might be a poison or undef value.
897 const SCEV *Scev = SE->getSCEV(Ptr);
898 if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
899 !isa<Instruction>(Ptr) || Depth == 0) {
900 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
901 return;
902 }
903
904 Depth--;
905
906 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
907 return get<1>(S);
908 };
909
910 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
911 switch (Opcode) {
912 case Instruction::Add:
913 return SE->getAddExpr(L, R);
914 case Instruction::Sub:
915 return SE->getMinusSCEV(L, R);
916 default:
917 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
918 }
919 };
920
921 Instruction *I = cast<Instruction>(Ptr);
922 unsigned Opcode = I->getOpcode();
923 switch (Opcode) {
924 case Instruction::GetElementPtr: {
925 GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
926 Type *SourceTy = GEP->getSourceElementType();
927 // We only handle base + single offset GEPs here for now.
928 // Not dealing with preexisting gathers yet, so no vectors.
929 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
930 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
931 break;
932 }
933 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
934 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
935 findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
936 findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
937
938 // See if we need to freeze our fork...
939 bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
940 any_of(OffsetScevs, UndefPoisonCheck);
941
942 // Check that we only have a single fork, on either the base or the offset.
943 // Copy the SCEV across for the one without a fork in order to generate
944 // the full SCEV for both sides of the GEP.
945 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
946 BaseScevs.push_back(BaseScevs[0]);
947 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
948 OffsetScevs.push_back(OffsetScevs[0]);
949 else {
950 ScevList.emplace_back(Scev, NeedsFreeze);
951 break;
952 }
953
954 // Find the pointer type we need to extend to.
955 Type *IntPtrTy = SE->getEffectiveSCEVType(
956 SE->getSCEV(GEP->getPointerOperand())->getType());
957
958 // Find the size of the type being pointed to. We only have a single
959 // index term (guarded above) so we don't need to index into arrays or
960 // structures, just get the size of the scalar value.
961 const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
962
963 // Scale up the offsets by the size of the type, then add to the bases.
964 const SCEV *Scaled1 = SE->getMulExpr(
965 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
966 const SCEV *Scaled2 = SE->getMulExpr(
967 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
968 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
969 NeedsFreeze);
970 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
971 NeedsFreeze);
972 break;
973 }
974 case Instruction::Select: {
975 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
976 // A select means we've found a forked pointer, but we currently only
977 // support a single select per pointer so if there's another behind this
978 // then we just bail out and return the generic SCEV.
979 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
980 findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
981 if (ChildScevs.size() == 2) {
982 ScevList.push_back(ChildScevs[0]);
983 ScevList.push_back(ChildScevs[1]);
984 } else
985 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
986 break;
987 }
988 case Instruction::PHI: {
989 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
990 // A phi means we've found a forked pointer, but we currently only
991 // support a single phi per pointer so if there's another behind this
992 // then we just bail out and return the generic SCEV.
993 if (I->getNumOperands() == 2) {
994 findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
995 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
996 }
997 if (ChildScevs.size() == 2) {
998 ScevList.push_back(ChildScevs[0]);
999 ScevList.push_back(ChildScevs[1]);
1000 } else
1001 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1002 break;
1003 }
1004 case Instruction::Add:
1005 case Instruction::Sub: {
1006 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> LScevs;
1007 SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> RScevs;
1008 findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
1009 findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
1010
1011 // See if we need to freeze our fork...
1012 bool NeedsFreeze =
1013 any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
1014
1015 // Check that we only have a single fork, on either the left or right side.
1016 // Copy the SCEV across for the one without a fork in order to generate
1017 // the full SCEV for both sides of the BinOp.
1018 if (LScevs.size() == 2 && RScevs.size() == 1)
1019 RScevs.push_back(RScevs[0]);
1020 else if (RScevs.size() == 2 && LScevs.size() == 1)
1021 LScevs.push_back(LScevs[0]);
1022 else {
1023 ScevList.emplace_back(Scev, NeedsFreeze);
1024 break;
1025 }
1026
1027 ScevList.emplace_back(
1028 GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
1029 NeedsFreeze);
1030 ScevList.emplace_back(
1031 GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
1032 NeedsFreeze);
1033 break;
1034 }
1035 default:
1036 // Just return the current SCEV if we haven't handled the instruction yet.
1037 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1038 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1039 break;
1040 }
1041}
1042
1043static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
1044findForkedPointer(PredicatedScalarEvolution &PSE,
1045 const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
1046 const Loop *L) {
1047 ScalarEvolution *SE = PSE.getSE();
1048 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
1049 SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
1050 findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
1051
1052 // For now, we will only accept a forked pointer with two possible SCEVs
1053 // that are either SCEVAddRecExprs or loop invariant.
1054 if (Scevs.size() == 2 &&
1055 (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
1056 SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
1057 (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
1058 SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
1059 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
1060 LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
1061 LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
1062 return Scevs;
1063 }
1064
1065 return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
1066}
1067
1068bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
1069 MemAccessInfo Access, Type *AccessTy,
1070 const DenseMap<Value *, const SCEV *> &StridesMap,
1071 DenseMap<MemAccessInfo, unsigned> &DepSetId,
1072 Loop *TheLoop, unsigned &RunningDepId,
1073 unsigned ASId, bool ShouldCheckWrap,
1074 bool Assume) {
1075 Value *Ptr = Access.getPointer();
1076
1077 SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
1078 findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
1079
1080 for (auto &P : TranslatedPtrs) {
1081 const SCEV *PtrExpr = get<0>(P);
1082 if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
1083 return false;
1084
1085 // When we run after a failing dependency check we have to make sure
1086 // we don't have wrapping pointers.
1087 if (ShouldCheckWrap) {
1088 // Skip wrap checking when translating pointers.
1089 if (TranslatedPtrs.size() > 1)
1090 return false;
1091
1092 if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
1093 auto *Expr = PSE.getSCEV(Ptr);
1094 if (!Assume || !isa<SCEVAddRecExpr>(Expr))
1095 return false;
1096 PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
1097 }
1098 }
1099 // If there's only one option for Ptr, look it up after bounds and wrap
1100 // checking, because assumptions might have been added to PSE.
1101 if (TranslatedPtrs.size() == 1)
1102 TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
1103 false};
1104 }
1105
1106 for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
1107 // The id of the dependence set.
1108 unsigned DepId;
1109
1110 if (isDependencyCheckNeeded()) {
1111 Value *Leader = DepCands.getLeaderValue(Access).getPointer();
1112 unsigned &LeaderId = DepSetId[Leader];
1113 if (!LeaderId)
1114 LeaderId = RunningDepId++;
1115 DepId = LeaderId;
1116 } else
1117 // Each access has its own dependence set.
1118 DepId = RunningDepId++;
1119
1120 bool IsWrite = Access.getInt();
1121 RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
1122 NeedsFreeze);
1123 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1124 }
1125
1126 return true;
1127}
1128
1129bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
1130 ScalarEvolution *SE, Loop *TheLoop,
1131 const DenseMap<Value *, const SCEV *> &StridesMap,
1132 Value *&UncomputablePtr, bool ShouldCheckWrap) {
1133 // Find pointers with computable bounds. We are going to use this information
1134 // to place a runtime bound check.
1135 bool CanDoRT = true;
1136
1137 bool MayNeedRTCheck = false;
1138 if (!IsRTCheckAnalysisNeeded) return true;
1139
1140 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1141
1142 // We assign a consecutive id to accesses from different alias sets.
1143 // Accesses between different groups don't need to be checked.
1144 unsigned ASId = 0;
1145 for (auto &AS : AST) {
1146 int NumReadPtrChecks = 0;
1147 int NumWritePtrChecks = 0;
1148 bool CanDoAliasSetRT = true;
1149 ++ASId;
1150 auto ASPointers = AS.getPointers();
1151
1152 // We assign consecutive ids to accesses from different dependence sets.
1153 // Accesses within the same set don't need a runtime check.
1154 unsigned RunningDepId = 1;
1155 DenseMap<MemAccessInfo, unsigned> DepSetId;
1156
1157 SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
1158
1159 // First, count how many write and read accesses are in the alias set. Also
1160 // collect MemAccessInfos for later.
1161 SmallVector<MemAccessInfo, 4> AccessInfos;
1162 for (const Value *Ptr_ : ASPointers) {
1163 Value *Ptr = const_cast<Value *>(Ptr_);
1164 bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
1165 if (IsWrite)
1166 ++NumWritePtrChecks;
1167 else
1168 ++NumReadPtrChecks;
1169 AccessInfos.emplace_back(Ptr, IsWrite);
1170 }
1171
1172 // We do not need runtime checks for this alias set, if there are no writes
1173 // or a single write and no reads.
1174 if (NumWritePtrChecks == 0 ||
1175 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1176 assert((ASPointers.size() <= 1 ||
1177 all_of(ASPointers,
1178 [this](const Value *Ptr) {
1179 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1180 true);
1181 return DepCands.findValue(AccessWrite) == DepCands.end();
1182 })) &&
1183 "Can only skip updating CanDoRT below, if all entries in AS "
1184 "are reads or there is at most 1 entry");
1185 continue;
1186 }
1187
1188 for (auto &Access : AccessInfos) {
1189 for (const auto &AccessTy : Accesses[Access]) {
1190 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1191 DepSetId, TheLoop, RunningDepId, ASId,
1192 ShouldCheckWrap, false)) {
1193 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1194 << *Access.getPointer() << '\n');
1195 Retries.push_back({Access, AccessTy});
1196 CanDoAliasSetRT = false;
1197 }
1198 }
1199 }
1200
1201 // Note that this function computes CanDoRT and MayNeedRTCheck
1202 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1203 // we have a pointer for which we couldn't find the bounds but we don't
1204 // actually need to emit any checks so it does not matter.
1205 //
1206 // We need runtime checks for this alias set, if there are at least 2
1207 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1208 // any bound checks (because in that case the number of dependence sets is
1209 // incomplete).
1210 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1211
1212 // We need to perform run-time alias checks, but some pointers had bounds
1213 // that couldn't be checked.
1214 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1215 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1216 // We know that we need these checks, so we can now be more aggressive
1217 // and add further checks if required (overflow checks).
1218 CanDoAliasSetRT = true;
1219 for (auto Retry : Retries) {
1220 MemAccessInfo Access = Retry.first;
1221 Type *AccessTy = Retry.second;
1222 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1223 DepSetId, TheLoop, RunningDepId, ASId,
1224 ShouldCheckWrap, /*Assume=*/true)) {
1225 CanDoAliasSetRT = false;
1226 UncomputablePtr = Access.getPointer();
1227 break;
1228 }
1229 }
1230 }
1231
1232 CanDoRT &= CanDoAliasSetRT;
1233 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1234 ++ASId;
1235 }
1236
1237 // If the pointers that we would use for the bounds comparison have different
1238 // address spaces, assume the values aren't directly comparable, so we can't
1239 // use them for the runtime check. We also have to assume they could
1240 // overlap. In the future there should be metadata for whether address spaces
1241 // are disjoint.
1242 unsigned NumPointers = RtCheck.Pointers.size();
1243 for (unsigned i = 0; i < NumPointers; ++i) {
1244 for (unsigned j = i + 1; j < NumPointers; ++j) {
1245 // Only need to check pointers between two different dependency sets.
1246 if (RtCheck.Pointers[i].DependencySetId ==
1247 RtCheck.Pointers[j].DependencySetId)
1248 continue;
1249 // Only need to check pointers in the same alias set.
1250 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1251 continue;
1252
1253 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1254 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1255
1256 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1257 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1258 if (ASi != ASj) {
1259 LLVM_DEBUG(
1260 dbgs() << "LAA: Runtime check would require comparison between"
1261 " different address spaces\n");
1262 return false;
1263 }
1264 }
1265 }
1266
1267 if (MayNeedRTCheck && CanDoRT)
1268 RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
1269
1270 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1271 << " pointer comparisons.\n");
1272
1273 // If we can do run-time checks, but there are no checks, no runtime checks
1274 // are needed. This can happen when all pointers point to the same underlying
1275 // object for example.
1276 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1277
1278 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1279 if (!CanDoRTIfNeeded)
1280 RtCheck.reset();
1281 return CanDoRTIfNeeded;
1282}
1283
1284void AccessAnalysis::processMemAccesses() {
1285 // We process the set twice: first we process read-write pointers, last we
1286 // process read-only pointers. This allows us to skip dependence tests for
1287 // read-only pointers.
1288
1289 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1290 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1291 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1292 LLVM_DEBUG({
1293 for (auto A : Accesses)
1294 dbgs() << "\t" << *A.first.getPointer() << " ("
1295 << (A.first.getInt()
1296 ? "write"
1297 : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
1298 : "read"))
1299 << ")\n";
1300 });
1301
1302 // The AliasSetTracker has nicely partitioned our pointers by metadata
1303 // compatibility and potential for underlying-object overlap. As a result, we
1304 // only need to check for potential pointer dependencies within each alias
1305 // set.
1306 for (const auto &AS : AST) {
1307 // Note that both the alias-set tracker and the alias sets themselves use
1308 // ordered collections internally and so the iteration order here is
1309 // deterministic.
1310 auto ASPointers = AS.getPointers();
1311
1312 bool SetHasWrite = false;
1313
1314 // Map of pointers to last access encountered.
1315 typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
1316 UnderlyingObjToAccessMap ObjToLastAccess;
1317
1318 // Set of access to check after all writes have been processed.
1319 PtrAccessMap DeferredAccesses;
1320
1321 // Iterate over each alias set twice, once to process read/write pointers,
1322 // and then to process read-only pointers.
1323 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1324 bool UseDeferred = SetIteration > 0;
1325 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1326
1327 for (const Value *Ptr_ : ASPointers) {
1328 Value *Ptr = const_cast<Value *>(Ptr_);
1329
1330 // For a single memory access in AliasSetTracker, Accesses may contain
1331 // both read and write, and they both need to be handled for CheckDeps.
1332 for (const auto &AC : S) {
1333 if (AC.first.getPointer() != Ptr)
1334 continue;
1335
1336 bool IsWrite = AC.first.getInt();
1337
1338 // If we're using the deferred access set, then it contains only
1339 // reads.
1340 bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
1341 if (UseDeferred && !IsReadOnlyPtr)
1342 continue;
1343 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1344 // read or a write.
1345 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1346 S.count(MemAccessInfo(Ptr, false))) &&
1347 "Alias-set pointer not in the access set?");
1348
1349 MemAccessInfo Access(Ptr, IsWrite);
1350 DepCands.insert(Access);
1351
1352 // Memorize read-only pointers for later processing and skip them in
1353 // the first round (they need to be checked after we have seen all
1354 // write pointers). Note: we also mark pointers that are not
1355 // consecutive as "read-only" pointers (so that we check
1356 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 if (!UseDeferred && IsReadOnlyPtr) {
1358 // We only use the pointer keys, the types vector values don't
1359 // matter.
1360 DeferredAccesses.insert({Access, {}});
1361 continue;
1362 }
1363
1364 // If this is a write - check other reads and writes for conflicts. If
1365 // this is a read only check other writes for conflicts (but only if
1366 // there is no other write to the ptr - this is an optimization to
1367 // catch "a[i] = a[i] + " without having to do a dependence check).
1368 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1369 CheckDeps.push_back(Access);
1370 IsRTCheckAnalysisNeeded = true;
1371 }
1372
1373 if (IsWrite)
1374 SetHasWrite = true;
1375
1376 // Create sets of pointers connected by a shared alias set and
1377 // underlying object.
1378 typedef SmallVector<const Value *, 16> ValueVector;
1379 ValueVector TempObjects;
1380
1381 UnderlyingObjects[Ptr] = {};
1382 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1383 ::getUnderlyingObjects(Ptr, UOs, LI);
1385 << "Underlying objects for pointer " << *Ptr << "\n");
1386 for (const Value *UnderlyingObj : UOs) {
1387 // nullptr never aliases; don't join sets for pointers that have "null"
1388 // in their UnderlyingObjects list.
1389 if (isa<ConstantPointerNull>(UnderlyingObj) &&
1390 !NullPointerIsDefined(
1391 TheLoop->getHeader()->getParent(),
1392 UnderlyingObj->getType()->getPointerAddressSpace()))
1393 continue;
1394
1395 UnderlyingObjToAccessMap::iterator Prev =
1396 ObjToLastAccess.find(UnderlyingObj);
1397 if (Prev != ObjToLastAccess.end())
1398 DepCands.unionSets(Access, Prev->second);
1399
1400 ObjToLastAccess[UnderlyingObj] = Access;
1401 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1402 }
1403 }
1404 }
1405 }
1406 }
1407}
1408
1409/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
1410/// i.e. monotonically increasing/decreasing.
1411static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
1412 PredicatedScalarEvolution &PSE, const Loop *L) {
1413
1414 // FIXME: This should probably only return true for NUW.
1415 if (AR->getNoWrapFlags(SCEV::NoWrapMask))
1416 return true;
1417
1418 if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
1419 return true;
1420
1421 // Scalar evolution does not propagate the non-wrapping flags to values that
1422 // are derived from a non-wrapping induction variable because non-wrapping
1423 // could be flow-sensitive.
1424 //
1425 // Look through the potentially overflowing instruction to try to prove
1426 // non-wrapping for the *specific* value of Ptr.
1427
1428 // The arithmetic implied by an inbounds GEP can't overflow.
1429 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1430 if (!GEP || !GEP->isInBounds())
1431 return false;
1432
1433 // Make sure there is only one non-const index and analyze that.
1434 Value *NonConstIndex = nullptr;
1435 for (Value *Index : GEP->indices())
1436 if (!isa<ConstantInt>(Index)) {
1437 if (NonConstIndex)
1438 return false;
1439 NonConstIndex = Index;
1440 }
1441 if (!NonConstIndex)
1442 // The recurrence is on the pointer, ignore for now.
1443 return false;
1444
1445 // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
1446 // AddRec using a NSW operation.
1447 if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
1448 if (OBO->hasNoSignedWrap() &&
1449 // Assume constant for the other operand so that the AddRec can be
1450 // easily found.
1451 isa<ConstantInt>(OBO->getOperand(1))) {
1452 auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
1453
1454 if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
1455 return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
1456 }
1457
1458 return false;
1459}
1460
1461/// Check whether the access through \p Ptr has a constant stride.
1462std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
1463 Type *AccessTy, Value *Ptr,
1464 const Loop *Lp,
1465 const DenseMap<Value *, const SCEV *> &StridesMap,
1466 bool Assume, bool ShouldCheckWrap) {
1467 Type *Ty = Ptr->getType();
1468 assert(Ty->isPointerTy() && "Unexpected non-ptr");
1469
1470 if (isa<ScalableVectorType>(AccessTy)) {
1471 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
1472 << "\n");
1473 return std::nullopt;
1474 }
1475
1476 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1477
1478 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1479 if (Assume && !AR)
1480 AR = PSE.getAsAddRec(Ptr);
1481
1482 if (!AR) {
1483 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1484 << " SCEV: " << *PtrScev << "\n");
1485 return std::nullopt;
1486 }
1487
1488 // The access function must stride over the innermost loop.
1489 if (Lp != AR->getLoop()) {
1490 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1491 << *Ptr << " SCEV: " << *AR << "\n");
1492 return std::nullopt;
1493 }
1494
1495 // Check the step is constant.
1496 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1497
1498 // Calculate the pointer stride and check if it is constant.
1499 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1500 if (!C) {
1501 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1502 << " SCEV: " << *AR << "\n");
1503 return std::nullopt;
1504 }
1505
1506 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1507 TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
1508 int64_t Size = AllocSize.getFixedValue();
1509 const APInt &APStepVal = C->getAPInt();
1510
1511 // Huge step value - give up.
1512 if (APStepVal.getBitWidth() > 64)
1513 return std::nullopt;
1514
1515 int64_t StepVal = APStepVal.getSExtValue();
1516
1517 // Strided access.
1518 int64_t Stride = StepVal / Size;
1519 int64_t Rem = StepVal % Size;
1520 if (Rem)
1521 return std::nullopt;
1522
1523 if (!ShouldCheckWrap)
1524 return Stride;
1525
1526 // The address calculation must not wrap. Otherwise, a dependence could be
1527 // inverted.
1528 if (isNoWrapAddRec(Ptr, AR, PSE, Lp))
1529 return Stride;
1530
1531 // An inbounds getelementptr that is an AddRec with a unit stride
1532 // cannot wrap per definition. If it did, the result would be poison
1533 // and any memory access dependent on it would be immediate UB
1534 // when executed.
1535 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1536 GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1))
1537 return Stride;
1538
1539 // If the null pointer is undefined, then an access sequence which would
1540 // otherwise access it can be assumed not to unsigned wrap. Note that this
1541 // assumes the object in memory is aligned to the natural alignment.
1542 unsigned AddrSpace = Ty->getPointerAddressSpace();
1543 if (!NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace) &&
1544 (Stride == 1 || Stride == -1))
1545 return Stride;
1546
1547 if (Assume) {
1548 PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
1549 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1550 << "LAA: Pointer: " << *Ptr << "\n"
1551 << "LAA: SCEV: " << *AR << "\n"
1552 << "LAA: Added an overflow assumption\n");
1553 return Stride;
1554 }
1555 LLVM_DEBUG(
1556 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1557 << *Ptr << " SCEV: " << *AR << "\n");
1558 return std::nullopt;
1559}
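// Worked example (illustrative, not from the original source): for an access
// A[2 * i] with AccessTy double, AllocSize is 8 and the AddRec step is 16, so
// Stride = 16 / 8 = 2 with no remainder. A step of 12 would leave Rem != 0 and
// getPtrStride would return std::nullopt (no constant stride).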
1560
1561std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1562 Type *ElemTyB, Value *PtrB,
1563 const DataLayout &DL,
1564 ScalarEvolution &SE, bool StrictCheck,
1565 bool CheckType) {
1566 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1567
1568 // Make sure that A and B are different pointers.
1569 if (PtrA == PtrB)
1570 return 0;
1571
1572 // Make sure that the element types are the same if required.
1573 if (CheckType && ElemTyA != ElemTyB)
1574 return std::nullopt;
1575
1576 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1577 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1578
1579 // Check that the address spaces match.
1580 if (ASA != ASB)
1581 return std::nullopt;
1582 unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1583
1584 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1585 Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1586 Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1587
1588 int Val;
1589 if (PtrA1 == PtrB1) {
1590 // Retrieve the address space again as pointer stripping now tracks through
1591 // `addrspacecast`.
1592 ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
1593 ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
1594 // Check that the address spaces match and that the pointers are valid.
1595 if (ASA != ASB)
1596 return std::nullopt;
1597
1598 IdxWidth = DL.getIndexSizeInBits(ASA);
1599 OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1600 OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1601
1602 OffsetB -= OffsetA;
1603 Val = OffsetB.getSExtValue();
1604 } else {
1605 // Otherwise compute the distance with SCEV between the base pointers.
1606 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1607 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1608 const auto *Diff =
1609 dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
1610 if (!Diff)
1611 return std::nullopt;
1612 Val = Diff->getAPInt().getSExtValue();
1613 }
1614 int Size = DL.getTypeStoreSize(ElemTyA);
1615 int Dist = Val / Size;
1616
1617  // Ensure that the calculated distance matches the type-based one after all
1618  // bitcasts have been removed from the provided pointers.
1619 if (!StrictCheck || Dist * Size == Val)
1620 return Dist;
1621 return std::nullopt;
1622}
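// Editor's illustration (not part of the LLVM source): what getPointersDiff
// computes for two hypothetical pointers into the same object, e.g.
//
//   %B = getelementptr inbounds i32, ptr %A, i64 3
//
// With ElemTyA == ElemTyB == i32, the stripped bases coincide, the byte offset
// difference is 12, Size = 4, and the function returns Dist = 3. With
// StrictCheck set, a byte offset that is not a multiple of the element size
// (say 6) gives Dist * Size != Val and the result is std::nullopt.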
1623
1624bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
1625                           const DataLayout &DL, ScalarEvolution &SE,
1626                           SmallVectorImpl<unsigned> &SortedIndices) {
1627  assert(llvm::all_of(
1628             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1629         "Expected list of pointer operands.");
1630  // Walk over the pointers, and map each of them to an offset relative to the
1631  // first pointer in the array.
1632 Value *Ptr0 = VL[0];
1633
1634 using DistOrdPair = std::pair<int64_t, int>;
1635 auto Compare = llvm::less_first();
1636 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1637 Offsets.emplace(0, 0);
1638 int Cnt = 1;
1639 bool IsConsecutive = true;
1640 for (auto *Ptr : VL.drop_front()) {
1641 std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
1642 /*StrictCheck=*/true);
1643 if (!Diff)
1644 return false;
1645
1646 // Check if the pointer with the same offset is found.
1647 int64_t Offset = *Diff;
1648 auto Res = Offsets.emplace(Offset, Cnt);
1649 if (!Res.second)
1650 return false;
1651 // Consecutive order if the inserted element is the last one.
1652 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
1653 ++Cnt;
1654 }
1655 SortedIndices.clear();
1656 if (!IsConsecutive) {
1657 // Fill SortedIndices array only if it is non-consecutive.
1658 SortedIndices.resize(VL.size());
1659 Cnt = 0;
1660 for (const std::pair<int64_t, int> &Pair : Offsets) {
1661 SortedIndices[Cnt] = Pair.second;
1662 ++Cnt;
1663 }
1664 }
1665 return true;
1666}
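// Editor's illustration (not part of the LLVM source): given hypothetical
// pointers VL = {P0, P1, P2} whose element distances from P0 are 0, 2 and 1
// (ElemTy = i32), the sorted set Offsets becomes {0->0, 1->2, 2->1}. Inserting
// P2's offset (1) does not land at the end of the set, so IsConsecutive turns
// false and SortedIndices is filled as {0, 2, 1}, the order in which the
// pointers have to be visited to see consecutive addresses. A repeated offset
// (two pointers at the same distance) makes the function return false.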
1667
1668/// Returns true if the memory operations \p A and \p B are consecutive.
1669bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
1670                               ScalarEvolution &SE, bool CheckType) {
1671  Value *PtrA = getLoadStorePointerOperand(A);
1672  Value *PtrB = getLoadStorePointerOperand(B);
1673  if (!PtrA || !PtrB)
1674 return false;
1675 Type *ElemTyA = getLoadStoreType(A);
1676 Type *ElemTyB = getLoadStoreType(B);
1677 std::optional<int> Diff =
1678 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1679 /*StrictCheck=*/true, CheckType);
1680 return Diff && *Diff == 1;
1681}
1682
1683void MemoryDepChecker::addAccess(StoreInst *SI) {
1684  visitPointers(SI->getPointerOperand(), *InnermostLoop,
1685 [this, SI](Value *Ptr) {
1686 Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
1687 InstMap.push_back(SI);
1688 ++AccessIdx;
1689 });
1690}
1691
1692void MemoryDepChecker::addAccess(LoadInst *LI) {
1693  visitPointers(LI->getPointerOperand(), *InnermostLoop,
1694 [this, LI](Value *Ptr) {
1695 Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
1696 InstMap.push_back(LI);
1697 ++AccessIdx;
1698 });
1699}
1700
1701MemoryDepChecker::VectorizationSafetyStatus
1702MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
1703  switch (Type) {
1704  case NoDep:
1705  case Forward:
1706  case ForwardButPreventsForwarding:
1707    return VectorizationSafetyStatus::Safe;
1708
1709  case Unknown:
1710    return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
1711  case BackwardVectorizable:
1712  case Backward:
1713  case BackwardVectorizableButPreventsForwarding:
1714  case IndirectUnsafe:
1715    return VectorizationSafetyStatus::Unsafe;
1716  }
1717 llvm_unreachable("unexpected DepType!");
1718}
1719
1720bool MemoryDepChecker::Dependence::isBackward() const {
1721  switch (Type) {
1722 case NoDep:
1723 case Forward:
1724 case ForwardButPreventsForwarding:
1725 case Unknown:
1726 case IndirectUnsafe:
1727 return false;
1728
1729 case BackwardVectorizable:
1730 case Backward:
1731 case BackwardVectorizableButPreventsForwarding:
1732 return true;
1733 }
1734 llvm_unreachable("unexpected DepType!");
1735}
1736
1737bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
1738  return isBackward() || Type == Unknown;
1739}
1740
1741bool MemoryDepChecker::Dependence::isForward() const {
1742  switch (Type) {
1743 case Forward:
1744 case ForwardButPreventsForwarding:
1745 return true;
1746
1747 case NoDep:
1748 case Unknown:
1749 case BackwardVectorizable:
1750 case Backward:
1751 case BackwardVectorizableButPreventsForwarding:
1752 case IndirectUnsafe:
1753 return false;
1754 }
1755 llvm_unreachable("unexpected DepType!");
1756}
1757
1758bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1759 uint64_t TypeByteSize) {
1760  // If loads occur at a distance that is not a multiple of a feasible vector
1761  // factor, store-to-load forwarding does not take place.
1762  // Positive dependences might cause trouble because vectorizing them might
1763  // prevent store-to-load forwarding, making vectorized code run a lot slower.
1764  //   a[i] = a[i-3] ^ a[i-8];
1765  // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
1766  // hence on a typical architecture store-to-load forwarding does not take
1767  // place. Vectorizing in such cases does not make sense.
1768 // Store-load forwarding distance.
1769
1770 // After this many iterations store-to-load forwarding conflicts should not
1771 // cause any slowdowns.
1772 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1773 // Maximum vector factor.
1774 uint64_t MaxVFWithoutSLForwardIssues = std::min(
1775 VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
1776
1777 // Compute the smallest VF at which the store and load would be misaligned.
1778 for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1779 VF *= 2) {
1780    // If the number of vector iterations between the store and the load is
1781    // small we could incur conflicts.
1782 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1783 MaxVFWithoutSLForwardIssues = (VF >> 1);
1784 break;
1785 }
1786 }
1787
1788 if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1789 LLVM_DEBUG(
1790 dbgs() << "LAA: Distance " << Distance
1791 << " that could cause a store-load forwarding conflict\n");
1792 return true;
1793 }
1794
1795 if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
1796 MaxVFWithoutSLForwardIssues !=
1797 VectorizerParams::MaxVectorWidth * TypeByteSize)
1798 MinDepDistBytes = MaxVFWithoutSLForwardIssues;
1799 return false;
1800}
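// Editor's illustration (not part of the LLVM source): a worked run of the
// check above with hypothetical values TypeByteSize = 4 and Distance = 24,
// MinDepDistBytes still at its initial maximum:
//
//   NumItersForStoreLoadThroughMemory = 8 * 4 = 32
//   VF = 8 : 24 % 8 == 0                     -> fine at this width
//   VF = 16: 24 % 16 == 8 and 24 / 16 < 32   -> cap at VF / 2 = 8 bytes
//
// The cap (8) is not below 2 * TypeByteSize, so the function returns false,
// but MinDepDistBytes is lowered to 8, limiting the safe vector width.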
1801
1802void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1803 if (Status < S)
1804 Status = S;
1805}
1806
1807/// Given a dependence-distance \p Dist between two
1808/// memory accesses that have strides in the same direction, whose maximum
1809/// absolute stride is given in \p MaxStride, and that have the same
1810/// type size \p TypeByteSize, in a loop whose backedge-taken count is \p
1811/// BackedgeTakenCount, check if it is possible to prove statically that the
1812/// dependence distance is larger than the range that the accesses will travel
1813/// through the execution of the loop. If so, return true; otherwise return
1814/// false. This is useful, for example, in loops such as the following (PR31098):
1814/// is useful for example in loops such as the following (PR31098):
1815/// for (i = 0; i < D; ++i) {
1816/// = out[i];
1817/// out[i+D] =
1818/// }
1819static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1820                                     const SCEV &BackedgeTakenCount,
1821 const SCEV &Dist, uint64_t MaxStride,
1822 uint64_t TypeByteSize) {
1823
1824 // If we can prove that
1825 // (**) |Dist| > BackedgeTakenCount * Step
1826 // where Step is the absolute stride of the memory accesses in bytes,
1827 // then there is no dependence.
1828 //
1829 // Rationale:
1830 // We basically want to check if the absolute distance (|Dist/Step|)
1831 // is >= the loop iteration count (or > BackedgeTakenCount).
1832 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1833 // Section 4.2.1); Note, that for vectorization it is sufficient to prove
1834 // that the dependence distance is >= VF; This is checked elsewhere.
1835 // But in some cases we can prune dependence distances early, and
1836 // even before selecting the VF, and without a runtime test, by comparing
1837 // the distance against the loop iteration count. Since the vectorized code
1838 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1839 // also guarantees that distance >= VF.
1840 //
1841 const uint64_t ByteStride = MaxStride * TypeByteSize;
1842 const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1843 const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1844
1845 const SCEV *CastedDist = &Dist;
1846 const SCEV *CastedProduct = Product;
1847 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
1848 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
1849
1850 // The dependence distance can be positive/negative, so we sign extend Dist;
1851 // The multiplication of the absolute stride in bytes and the
1852 // backedgeTakenCount is non-negative, so we zero extend Product.
1853 if (DistTypeSizeBits > ProductTypeSizeBits)
1854 CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1855 else
1856 CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1857
1858 // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1859 // (If so, then we have proven (**) because |Dist| >= Dist)
1860 const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1861 if (SE.isKnownPositive(Minus))
1862 return true;
1863
1864 // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1865 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1866 const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1867 Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1868 if (SE.isKnownPositive(Minus))
1869 return true;
1870
1871 return false;
1872}
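// Editor's illustration (not part of the LLVM source): a worked instance of
// the (**) check above with hypothetical constants Dist = 4000 bytes,
// BackedgeTakenCount = 100, MaxStride = 4, TypeByteSize = 4:
//
//   ByteStride = 4 * 4 = 16,  Product = 100 * 16 = 1600
//   Dist - Product = 4000 - 1600 = 2400 > 0
//
// isKnownPositive then proves |Dist| > BackedgeTakenCount * Step, so the two
// accesses can never touch the same location within this loop and the
// dependence can be discarded. A distance of -4000 is caught by the second
// query on -Dist instead.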
1873
1874/// Check the dependence for two accesses with the same stride \p Stride.
1875/// \p Distance is the positive distance and \p TypeByteSize is the type size in
1876/// bytes.
1877///
1878/// \returns true if they are independent.
1879static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1880                                          uint64_t TypeByteSize) {
1881 assert(Stride > 1 && "The stride must be greater than 1");
1882 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1883 assert(Distance > 0 && "The distance must be non-zero");
1884
1885  // Skip if the distance is not a multiple of the type byte size.
1886 if (Distance % TypeByteSize)
1887 return false;
1888
1889 uint64_t ScaledDist = Distance / TypeByteSize;
1890
1891  // No dependence if the scaled distance is not a multiple of the stride.
1892 // E.g.
1893 // for (i = 0; i < 1024 ; i += 4)
1894 // A[i+2] = A[i] + 1;
1895 //
1896 // Two accesses in memory (scaled distance is 2, stride is 4):
1897 // | A[0] | | | | A[4] | | | |
1898 // | | | A[2] | | | | A[6] | |
1899 //
1900 // E.g.
1901 // for (i = 0; i < 1024 ; i += 3)
1902 // A[i+4] = A[i] + 1;
1903 //
1904 // Two accesses in memory (scaled distance is 4, stride is 3):
1905 // | A[0] | | | A[3] | | | A[6] | | |
1906 // | | | | | A[4] | | | A[7] | |
1907 return ScaledDist % Stride;
1908}
1909
1910/// Returns true if any of the underlying objects has a loop varying address,
1911/// i.e. may change in \p L.
1912static bool
1913isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
1914                             ScalarEvolution &SE, const Loop *L) {
1915 return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
1916 return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
1917 });
1918}
1919
1920namespace {
1921struct DepDistanceStrideAndSizeInfo {
1922 const SCEV *Dist;
1923 uint64_t StrideA;
1924 uint64_t StrideB;
1925 uint64_t TypeByteSize;
1926 bool AIsWrite;
1927 bool BIsWrite;
1928
1929 DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
1930 uint64_t StrideB, uint64_t TypeByteSize,
1931 bool AIsWrite, bool BIsWrite)
1932 : Dist(Dist), StrideA(StrideA), StrideB(StrideB),
1933 TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
1934};
1935} // namespace
1936
1937// Get the dependence distance, strides, type size and whether it is a write for
1938// the dependence between A and B. Returns a DepType if we can prove there is
1939// no dependence or the analysis fails. Outlined to a helper to limit the scope
1940// of various temporary variables, like A/BPtr, StrideA/BPtr and others.
1941// Returns either the dependence result, if it could already be determined, or a
1942// struct containing (Distance, Stride, TypeSize, AIsWrite, BIsWrite).
1943static std::variant<MemoryDepChecker::Dependence::DepType,
1944 DepDistanceStrideAndSizeInfo>
1945getDependenceDistanceStrideAndSize(
1946    const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
1947    const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
1948    const DenseMap<Value *, const SCEV *> &Strides,
1949 const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1950 PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1951 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1952 auto &SE = *PSE.getSE();
1953 auto [APtr, AIsWrite] = A;
1954 auto [BPtr, BIsWrite] = B;
1955
1956 // Two reads are independent.
1957 if (!AIsWrite && !BIsWrite)
1958    return MemoryDepChecker::Dependence::NoDep;
1959
1960 Type *ATy = getLoadStoreType(AInst);
1961 Type *BTy = getLoadStoreType(BInst);
1962
1963 // We cannot check pointers in different address spaces.
1964 if (APtr->getType()->getPointerAddressSpace() !=
1965 BPtr->getType()->getPointerAddressSpace())
1966    return MemoryDepChecker::Dependence::Unknown;
1967
1968 int64_t StrideAPtr =
1969 getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
1970 int64_t StrideBPtr =
1971 getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
1972
1973 const SCEV *Src = PSE.getSCEV(APtr);
1974 const SCEV *Sink = PSE.getSCEV(BPtr);
1975
1976 // If the induction step is negative we have to invert source and sink of the
1977 // dependence when measuring the distance between them. We should not swap
1978 // AIsWrite with BIsWrite, as their uses expect them in program order.
1979 if (StrideAPtr < 0) {
1980 std::swap(Src, Sink);
1981 std::swap(AInst, BInst);
1982 }
1983
1984 const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
1985
1986 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1987 << "(Induction step: " << StrideAPtr << ")\n");
1988 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
1989 << ": " << *Dist << "\n");
1990
1991 // Needs accesses where the addresses of the accessed underlying objects do
1992 // not change within the loop.
1993 if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
1994 InnermostLoop) ||
1995 isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
1996 InnermostLoop))
1997    return MemoryDepChecker::Dependence::IndirectUnsafe;
1998
1999 // Need accesses with constant strides and the same direction. We don't want
2000 // to vectorize "A[B[i]] += ..." and similar code or pointer arithmetic that
2001 // could wrap in the address space.
2002 if (!StrideAPtr || !StrideBPtr || (StrideAPtr > 0 && StrideBPtr < 0) ||
2003 (StrideAPtr < 0 && StrideBPtr > 0)) {
2004 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
2005    return MemoryDepChecker::Dependence::Unknown;
2006  }
2007
2008 uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
2009 bool HasSameSize =
2010 DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
2011 if (!HasSameSize)
2012 TypeByteSize = 0;
2013 return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtr),
2014 std::abs(StrideBPtr), TypeByteSize,
2015 AIsWrite, BIsWrite);
2016}
2017
2018MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
2019 const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
2020 unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2021    const DenseMap<Value *, SmallVector<const Value *, 16>>
2022        &UnderlyingObjects) {
2023 assert(AIdx < BIdx && "Must pass arguments in program order");
2024
2025  // Get the dependence distance, strides, type size and which access is a write
2026  // for the dependence between A and B.
2027  auto Res = getDependenceDistanceStrideAndSize(
2028      A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2029 InnermostLoop);
2030 if (std::holds_alternative<Dependence::DepType>(Res))
2031 return std::get<Dependence::DepType>(Res);
2032
2033 const auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
2034 std::get<DepDistanceStrideAndSizeInfo>(Res);
2035 bool HasSameSize = TypeByteSize > 0;
2036
2037 std::optional<uint64_t> CommonStride =
2038 StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt;
2039 if (isa<SCEVCouldNotCompute>(Dist)) {
2040 // TODO: Relax requirement that there is a common stride to retry with
2041 // non-constant distance dependencies.
2042 FoundNonConstantDistanceDependence |= !!CommonStride;
2043 LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
2044 return Dependence::Unknown;
2045 }
2046
2047 ScalarEvolution &SE = *PSE.getSE();
2048 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
2049 uint64_t MaxStride = std::max(StrideA, StrideB);
2050
2051  // If the distance between the accesses is larger than their maximum absolute
2052  // stride multiplied by the backedge taken count, the accesses are independent,
2053  // i.e. they are far enough apart that they won't access the same
2054  // location across all loop iterations.
2055 if (HasSameSize &&
2056      isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
2057                               MaxStride, TypeByteSize))
2058 return Dependence::NoDep;
2059
2060 const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2061
2062 // Attempt to prove strided accesses independent.
2063 if (C) {
2064 const APInt &Val = C->getAPInt();
2065 int64_t Distance = Val.getSExtValue();
2066
2067 // If the distance between accesses and their strides are known constants,
2068 // check whether the accesses interlace each other.
2069 if (std::abs(Distance) > 0 && CommonStride && *CommonStride > 1 &&
2070 HasSameSize &&
2071 areStridedAccessesIndependent(std::abs(Distance), *CommonStride,
2072 TypeByteSize)) {
2073 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2074 return Dependence::NoDep;
2075 }
2076 }
2077
2078 // Negative distances are not plausible dependencies.
2079 if (SE.isKnownNonPositive(Dist)) {
2080 if (SE.isKnownNonNegative(Dist)) {
2081 if (HasSameSize) {
2082 // Write to the same location with the same size.
2083 return Dependence::Forward;
2084 } else {
2085 LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2086 "different type sizes\n");
2087 return Dependence::Unknown;
2088 }
2089 }
2090
2091 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2092 // Check if the first access writes to a location that is read in a later
2093 // iteration, where the distance between them is not a multiple of a vector
2094 // factor and relatively small.
2095 //
2096 // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
2097 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since a
2098 // forward dependency will allow vectorization using any width.
2099
2100 if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2101 if (!C) {
2102 // TODO: FoundNonConstantDistanceDependence is used as a necessary
2103 // condition to consider retrying with runtime checks. Historically, we
2104 // did not set it when strides were different but there is no inherent
2105 // reason to.
2106 FoundNonConstantDistanceDependence |= CommonStride.has_value();
2107 return Dependence::Unknown;
2108 }
2109 if (!HasSameSize ||
2110 couldPreventStoreLoadForward(C->getAPInt().abs().getZExtValue(),
2111 TypeByteSize)) {
2112 LLVM_DEBUG(
2113 dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2114        return Dependence::ForwardButPreventsForwarding;
2115      }
2116 }
2117
2118 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2119 return Dependence::Forward;
2120 }
2121
2122 if (!C) {
2123 // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
2124 // to consider retrying with runtime checks. Historically, we did not set it
2125 // when strides were different but there is no inherent reason to.
2126 FoundNonConstantDistanceDependence |= CommonStride.has_value();
2127 LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
2128 return Dependence::Unknown;
2129 }
2130
2131 if (!SE.isKnownPositive(Dist))
2132 return Dependence::Unknown;
2133
2134 if (!HasSameSize) {
2135 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2136 "different type sizes\n");
2137 return Dependence::Unknown;
2138 }
2139
2140 // The logic below currently only supports StrideA == StrideB, i.e. there's a
2141 // common stride.
2142 if (!CommonStride)
2143 return Dependence::Unknown;
2144
2145 const APInt &Val = C->getAPInt();
2146 int64_t Distance = Val.getSExtValue();
2147
2148 // Bail out early if passed-in parameters make vectorization not feasible.
2149  unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2150                           VectorizerParams::VectorizationFactor : 1);
2151  unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2152                           VectorizerParams::VectorizationInterleave : 1);
2153 // The minimum number of iterations for a vectorized/unrolled version.
2154 unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
2155
2156  // It's not vectorizable if the distance is smaller than the minimum distance
2157  // needed for a vectorized/unrolled version. Vectorizing one iteration in
2158  // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2159  // TypeByteSize (no need to add the last gap distance).
2160 //
2161 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2162 // foo(int *A) {
2163 // int *B = (int *)((char *)A + 14);
2164 // for (i = 0 ; i < 1024 ; i += 2)
2165 // B[i] = A[i] + 1;
2166 // }
2167 //
2168 // Two accesses in memory (stride is 2):
2169 // | A[0] | | A[2] | | A[4] | | A[6] | |
2170 // | B[0] | | B[2] | | B[4] |
2171 //
2172  // Distance needed for vectorizing iterations except the last iteration:
2173  // 4 * 2 * (MinNumIter - 1). Distance needed for the last iteration: 4.
2174 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2175 //
2176  // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2177  // 12, which is less than the distance (14).
2178  //
2179  // If MinNumIter is 4 (say, if a user forces the vectorization factor to be 4),
2180  // the minimum distance needed is 28, which is greater than the distance (14). It is
2181 // not safe to do vectorization.
2182 uint64_t MinDistanceNeeded =
2183 TypeByteSize * (*CommonStride) * (MinNumIter - 1) + TypeByteSize;
2184 if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
2185 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
2186 << Distance << '\n');
2187 return Dependence::Backward;
2188 }
2189
2190  // Unsafe if the minimum distance needed is greater than the smallest dependence
2191  // distance.
2192 if (MinDistanceNeeded > MinDepDistBytes) {
2193 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2194 << MinDistanceNeeded << " size in bytes\n");
2195 return Dependence::Backward;
2196 }
2197
2198 // Positive distance bigger than max vectorization factor.
2199 // FIXME: Should use max factor instead of max distance in bytes, which could
2200 // not handle different types.
2201 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2202 // void foo (int *A, char *B) {
2203 // for (unsigned i = 0; i < 1024; i++) {
2204 // A[i+2] = A[i] + 1;
2205 // B[i+2] = B[i] + 1;
2206 // }
2207 // }
2208 //
2209  // This case is currently unsafe according to the max safe distance. If we
2210  // analyze the two accesses on array B, the max safe dependence distance
2211  // is 2. Then we analyze the accesses on array A, where the minimum distance
2212  // needed is 8, which is greater than 2, so vectorization is forbidden. But
2213  // actually both A and B could be vectorized with a factor of 2.
2214 MinDepDistBytes =
2215 std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
2216
2217 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2218 uint64_t MinDepDistBytesOld = MinDepDistBytes;
2219 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2220 couldPreventStoreLoadForward(Distance, TypeByteSize)) {
2221 // Sanity check that we didn't update MinDepDistBytes when calling
2222 // couldPreventStoreLoadForward
2223 assert(MinDepDistBytes == MinDepDistBytesOld &&
2224 "An update to MinDepDistBytes requires an update to "
2225 "MaxSafeVectorWidthInBits");
2226 (void)MinDepDistBytesOld;
2227    return Dependence::BackwardVectorizableButPreventsForwarding;
2228  }
2229
2230 // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2231 // since there is a backwards dependency.
2232 uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * (*CommonStride));
2233 LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2234 << " with max VF = " << MaxVF << '\n');
2235 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2236 MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
2237  return Dependence::BackwardVectorizable;
2238}
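// Editor's illustration (not part of the LLVM source): the positive-distance
// path above for a hypothetical loop "A[i + 8] = A[i] + 1" with i32 elements
// (TypeByteSize = 4, CommonStride = 1, constant Distance = 32, MinNumIter = 2):
//
//   MinDistanceNeeded = 4 * 1 * (2 - 1) + 4 = 8   (<= 32, so not Backward)
//   MinDepDistBytes   = min(32, MinDepDistBytes)  = 32
//   MaxVF             = 32 / (4 * 1) = 8,  MaxVFInBits = 256
//
// The dependence is classified BackwardVectorizable and the recorded maximum
// safe vector width becomes 256 bits (eight i32 lanes).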
2239
2240bool MemoryDepChecker::areDepsSafe(
2241    DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
2242    const DenseMap<Value *, const SCEV *> &Strides,
2243    const DenseMap<Value *, SmallVector<const Value *, 16>>
2244        &UnderlyingObjects) {
2245
2246 MinDepDistBytes = -1;
2247  SmallPtrSet<MemAccessInfo, 8> Visited;
2248  for (MemAccessInfo CurAccess : CheckDeps) {
2249 if (Visited.count(CurAccess))
2250 continue;
2251
2252 // Get the relevant memory access set.
2253    EquivalenceClasses<MemAccessInfo>::iterator I =
2254        AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
2255
2256 // Check accesses within this set.
2258 AccessSets.member_begin(I);
2260 AccessSets.member_end();
2261
2262 // Check every access pair.
2263 while (AI != AE) {
2264 Visited.insert(*AI);
2265 bool AIIsWrite = AI->getInt();
2266      // Check loads only against the following members of the equivalence class,
2267      // but stores also against other stores to the same address in the same class.
2269 (AIIsWrite ? AI : std::next(AI));
2270 while (OI != AE) {
2271 // Check every accessing instruction pair in program order.
2272 for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
2273 I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
2274        // Scan all accesses of another equivalence class, but only the following
2275        // accesses of the same equivalence class.
2276 for (std::vector<unsigned>::iterator
2277 I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
2278 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2279 I2 != I2E; ++I2) {
2280 auto A = std::make_pair(&*AI, *I1);
2281 auto B = std::make_pair(&*OI, *I2);
2282
2283 assert(*I1 != *I2);
2284 if (*I1 > *I2)
2285 std::swap(A, B);
2286
2287          Dependence::DepType Type =
2288              isDependent(*A.first, A.second, *B.first, B.second, Strides,
2289                          UnderlyingObjects);
2290          mergeInStatus(Dependence::isSafeForVectorization(Type));
2291
2292 // Gather dependences unless we accumulated MaxDependences
2293 // dependences. In that case return as soon as we find the first
2294 // unsafe dependence. This puts a limit on this quadratic
2295 // algorithm.
2296 if (RecordDependences) {
2297 if (Type != Dependence::NoDep)
2298 Dependences.push_back(Dependence(A.second, B.second, Type));
2299
2300 if (Dependences.size() >= MaxDependences) {
2301 RecordDependences = false;
2302 Dependences.clear();
2303              LLVM_DEBUG(dbgs()
2304                         << "Too many dependences, stopped recording\n");
2305 }
2306 }
2307 if (!RecordDependences && !isSafeForVectorization())
2308 return false;
2309 }
2310 ++OI;
2311 }
2312 AI++;
2313 }
2314 }
2315
2316 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2317 return isSafeForVectorization();
2318}
2319
2320SmallVector<Instruction *, 4>
2321MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
2322  MemAccessInfo Access(Ptr, isWrite);
2323 auto &IndexVector = Accesses.find(Access)->second;
2324
2325  SmallVector<Instruction *, 4> Insts;
2326  transform(IndexVector,
2327 std::back_inserter(Insts),
2328 [&](unsigned Idx) { return this->InstMap[Idx]; });
2329 return Insts;
2330}
2331
2332const char *MemoryDepChecker::Dependence::DepName[] = {
2333    "NoDep",
2334 "Unknown",
2335 "IndirectUnsafe",
2336 "Forward",
2337 "ForwardButPreventsForwarding",
2338 "Backward",
2339 "BackwardVectorizable",
2340 "BackwardVectorizableButPreventsForwarding"};
2341
2342void MemoryDepChecker::Dependence::print(
2343    raw_ostream &OS, unsigned Depth,
2344 const SmallVectorImpl<Instruction *> &Instrs) const {
2345 OS.indent(Depth) << DepName[Type] << ":\n";
2346 OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
2347 OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
2348}
2349
2350bool LoopAccessInfo::canAnalyzeLoop() {
2351 // We need to have a loop header.
2352 LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
2353 << TheLoop->getHeader()->getParent()->getName() << ": "
2354 << TheLoop->getHeader()->getName() << '\n');
2355
2356 // We can only analyze innermost loops.
2357 if (!TheLoop->isInnermost()) {
2358 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2359 recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
2360 return false;
2361 }
2362
2363 // We must have a single backedge.
2364 if (TheLoop->getNumBackEdges() != 1) {
2365 LLVM_DEBUG(
2366 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2367 recordAnalysis("CFGNotUnderstood")
2368 << "loop control flow is not understood by analyzer";
2369 return false;
2370 }
2371
2372 // ScalarEvolution needs to be able to find the exit count.
2373 const SCEV *ExitCount = PSE->getBackedgeTakenCount();
2374 if (isa<SCEVCouldNotCompute>(ExitCount)) {
2375 recordAnalysis("CantComputeNumberOfIterations")
2376 << "could not determine number of loop iterations";
2377 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2378 return false;
2379 }
2380
2381 return true;
2382}
2383
2384void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
2385 const TargetLibraryInfo *TLI,
2386 DominatorTree *DT) {
2387 // Holds the Load and Store instructions.
2388  SmallVector<LoadInst *, 16> Loads;
2389  SmallVector<StoreInst *, 16> Stores;
2390  SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2391
2392 // Holds all the different accesses in the loop.
2393 unsigned NumReads = 0;
2394 unsigned NumReadWrites = 0;
2395
2396 bool HasComplexMemInst = false;
2397
2398 // A runtime check is only legal to insert if there are no convergent calls.
2399 HasConvergentOp = false;
2400
2401 PtrRtChecking->Pointers.clear();
2402 PtrRtChecking->Need = false;
2403
2404 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2405
2406 const bool EnableMemAccessVersioningOfLoop =
2407      EnableMemAccessVersioning &&
2408      !TheLoop->getHeader()->getParent()->hasOptSize();
2409
2410 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2411 // loop info, as it may be arbitrary.
2412 LoopBlocksRPO RPOT(TheLoop);
2413 RPOT.perform(LI);
2414 for (BasicBlock *BB : RPOT) {
2415 // Scan the BB and collect legal loads and stores. Also detect any
2416 // convergent instructions.
2417 for (Instruction &I : *BB) {
2418 if (auto *Call = dyn_cast<CallBase>(&I)) {
2419 if (Call->isConvergent())
2420 HasConvergentOp = true;
2421 }
2422
2423      // With both a non-vectorizable memory instruction and a convergent
2424      // operation found in this loop, there is no reason to continue the search.
2425 if (HasComplexMemInst && HasConvergentOp) {
2426 CanVecMem = false;
2427 return;
2428 }
2429
2430 // Avoid hitting recordAnalysis multiple times.
2431 if (HasComplexMemInst)
2432 continue;
2433
2434 // Record alias scopes defined inside the loop.
2435 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
2436 for (Metadata *Op : Decl->getScopeList()->operands())
2437 LoopAliasScopes.insert(cast<MDNode>(Op));
2438
2439 // Many math library functions read the rounding mode. We will only
2440 // vectorize a loop if it contains known function calls that don't set
2441 // the flag. Therefore, it is safe to ignore this read from memory.
2442 auto *Call = dyn_cast<CallInst>(&I);
2443 if (Call && getVectorIntrinsicIDForCall(Call, TLI))
2444 continue;
2445
2446 // If this is a load, save it. If this instruction can read from memory
2447 // but is not a load, then we quit. Notice that we don't handle function
2448 // calls that read or write.
2449 if (I.mayReadFromMemory()) {
2450 // If the function has an explicit vectorized counterpart, we can safely
2451 // assume that it can be vectorized.
2452 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2453 !VFDatabase::getMappings(*Call).empty())
2454 continue;
2455
2456 auto *Ld = dyn_cast<LoadInst>(&I);
2457 if (!Ld) {
2458 recordAnalysis("CantVectorizeInstruction", Ld)
2459 << "instruction cannot be vectorized";
2460 HasComplexMemInst = true;
2461 continue;
2462 }
2463 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2464 recordAnalysis("NonSimpleLoad", Ld)
2465 << "read with atomic ordering or volatile read";
2466 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2467 HasComplexMemInst = true;
2468 continue;
2469 }
2470 NumLoads++;
2471 Loads.push_back(Ld);
2472 DepChecker->addAccess(Ld);
2473 if (EnableMemAccessVersioningOfLoop)
2474 collectStridedAccess(Ld);
2475 continue;
2476 }
2477
2478 // Save 'store' instructions. Abort if other instructions write to memory.
2479 if (I.mayWriteToMemory()) {
2480 auto *St = dyn_cast<StoreInst>(&I);
2481 if (!St) {
2482 recordAnalysis("CantVectorizeInstruction", St)
2483 << "instruction cannot be vectorized";
2484 HasComplexMemInst = true;
2485 continue;
2486 }
2487 if (!St->isSimple() && !IsAnnotatedParallel) {
2488 recordAnalysis("NonSimpleStore", St)
2489 << "write with atomic ordering or volatile write";
2490 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2491 HasComplexMemInst = true;
2492 continue;
2493 }
2494 NumStores++;
2495 Stores.push_back(St);
2496 DepChecker->addAccess(St);
2497 if (EnableMemAccessVersioningOfLoop)
2498 collectStridedAccess(St);
2499 }
2500 } // Next instr.
2501 } // Next block.
2502
2503 if (HasComplexMemInst) {
2504 CanVecMem = false;
2505 return;
2506 }
2507
2508 // Now we have two lists that hold the loads and the stores.
2509 // Next, we find the pointers that they use.
2510
2511 // Check if we see any stores. If there are no stores, then we don't
2512 // care if the pointers are *restrict*.
2513 if (!Stores.size()) {
2514 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2515 CanVecMem = true;
2516 return;
2517 }
2518
2519 MemoryDepChecker::DepCandidates DependentAccesses;
2520 AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
2521 LoopAliasScopes);
2522
2523 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2524 // multiple times on the same object. If the ptr is accessed twice, once
2525 // for read and once for write, it will only appear once (on the write
2526 // list). This is okay, since we are going to check for conflicts between
2527 // writes and between reads and writes, but not between reads and reads.
2528  SmallSet<std::pair<Value *, Type *>, 16> Seen;
2529
2530 // Record uniform store addresses to identify if we have multiple stores
2531 // to the same address.
2532 SmallPtrSet<Value *, 16> UniformStores;
2533
2534 for (StoreInst *ST : Stores) {
2535 Value *Ptr = ST->getPointerOperand();
2536
2537 if (isInvariant(Ptr)) {
2538 // Record store instructions to loop invariant addresses
2539 StoresToInvariantAddresses.push_back(ST);
2540 HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
2541 !UniformStores.insert(Ptr).second;
2542 }
2543
2544 // If we did *not* see this pointer before, insert it to the read-write
2545 // list. At this phase it is only a 'write' list.
2546 Type *AccessTy = getLoadStoreType(ST);
2547 if (Seen.insert({Ptr, AccessTy}).second) {
2548 ++NumReadWrites;
2549
2550      MemoryLocation Loc = MemoryLocation::get(ST);
2551      // The TBAA metadata could have a control dependency on the predication
2552 // condition, so we cannot rely on it when determining whether or not we
2553 // need runtime pointer checks.
2554 if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
2555 Loc.AATags.TBAA = nullptr;
2556
2557 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2558 [&Accesses, AccessTy, Loc](Value *Ptr) {
2559 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2560 Accesses.addStore(NewLoc, AccessTy);
2561 });
2562 }
2563 }
2564
2565 if (IsAnnotatedParallel) {
2566 LLVM_DEBUG(
2567 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2568 << "checks.\n");
2569 CanVecMem = true;
2570 return;
2571 }
2572
2573 for (LoadInst *LD : Loads) {
2574 Value *Ptr = LD->getPointerOperand();
2575 // If we did *not* see this pointer before, insert it to the
2576 // read list. If we *did* see it before, then it is already in
2577 // the read-write list. This allows us to vectorize expressions
2578 // such as A[i] += x; Because the address of A[i] is a read-write
2579 // pointer. This only works if the index of A[i] is consecutive.
2580 // If the address of i is unknown (for example A[B[i]]) then we may
2581 // read a few words, modify, and write a few words, and some of the
2582 // words may be written to the same address.
2583 bool IsReadOnlyPtr = false;
2584 Type *AccessTy = getLoadStoreType(LD);
2585 if (Seen.insert({Ptr, AccessTy}).second ||
2586 !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
2587 ++NumReads;
2588 IsReadOnlyPtr = true;
2589 }
2590
2591    // See if there is an unsafe dependency between a load from a uniform address
2592    // and a store to the same uniform address.
2593 if (UniformStores.count(Ptr)) {
2594 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2595 "load and uniform store to the same address!\n");
2596 HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2597 }
2598
2599    MemoryLocation Loc = MemoryLocation::get(LD);
2600    // The TBAA metadata could have a control dependency on the predication
2601 // condition, so we cannot rely on it when determining whether or not we
2602 // need runtime pointer checks.
2603 if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
2604 Loc.AATags.TBAA = nullptr;
2605
2606 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2607 [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2608 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2609 Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
2610 });
2611 }
2612
2613  // If we write (or read-write) to a single destination and there are no
2614  // other reads in this loop then it is safe to vectorize.
2615 if (NumReadWrites == 1 && NumReads == 0) {
2616 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2617 CanVecMem = true;
2618 return;
2619 }
2620
2621 // Build dependence sets and check whether we need a runtime pointer bounds
2622 // check.
2623 Accesses.buildDependenceSets();
2624
2625 // Find pointers with computable bounds. We are going to use this information
2626 // to place a runtime bound check.
2627 Value *UncomputablePtr = nullptr;
2628 bool CanDoRTIfNeeded =
2629 Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
2630 SymbolicStrides, UncomputablePtr, false);
2631 if (!CanDoRTIfNeeded) {
2632 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2633 recordAnalysis("CantIdentifyArrayBounds", I)
2634 << "cannot identify array bounds";
2635 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2636 << "the array bounds.\n");
2637 CanVecMem = false;
2638 return;
2639 }
2640
2641 LLVM_DEBUG(
2642 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2643
2644 CanVecMem = true;
2645 if (Accesses.isDependencyCheckNeeded()) {
2646 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2647 CanVecMem = DepChecker->areDepsSafe(
2648 DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2649 Accesses.getUnderlyingObjects());
2650
2651 if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2652 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2653
2654 // Clear the dependency checks. We assume they are not needed.
2655 Accesses.resetDepChecks(*DepChecker);
2656
2657 PtrRtChecking->reset();
2658 PtrRtChecking->Need = true;
2659
2660 auto *SE = PSE->getSE();
2661 UncomputablePtr = nullptr;
2662 CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
2663 *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
2664
2665 // Check that we found the bounds for the pointer.
2666 if (!CanDoRTIfNeeded) {
2667 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2668 recordAnalysis("CantCheckMemDepsAtRunTime", I)
2669 << "cannot check memory dependencies at runtime";
2670 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2671 CanVecMem = false;
2672 return;
2673 }
2674
2675 CanVecMem = true;
2676 }
2677 }
2678
2679 if (HasConvergentOp) {
2680 recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2681 << "cannot add control dependency to convergent operation";
2682 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2683 "would be needed with a convergent operation\n");
2684 CanVecMem = false;
2685 return;
2686 }
2687
2688 if (CanVecMem)
2689 LLVM_DEBUG(
2690 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2691 << (PtrRtChecking->Need ? "" : " don't")
2692 << " need runtime memory checks.\n");
2693 else
2694 emitUnsafeDependenceRemark();
2695}
2696
2697void LoopAccessInfo::emitUnsafeDependenceRemark() {
2698 auto Deps = getDepChecker().getDependences();
2699 if (!Deps)
2700 return;
2701 auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
2702    return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
2703           MemoryDepChecker::VectorizationSafetyStatus::Safe;
2704  });
2705 if (Found == Deps->end())
2706 return;
2707 MemoryDepChecker::Dependence Dep = *Found;
2708
2709 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2710
2711 // Emit remark for first unsafe dependence
2712 bool HasForcedDistribution = false;
2713 std::optional<const MDOperand *> Value =
2714 findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
2715 if (Value) {
2716 const MDOperand *Op = *Value;
2717 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2718 HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
2719 }
2720
2721 const std::string Info =
2722 HasForcedDistribution
2723 ? "unsafe dependent memory operations in loop."
2724 : "unsafe dependent memory operations in loop. Use "
2725 "#pragma clang loop distribute(enable) to allow loop distribution "
2726 "to attempt to isolate the offending operations into a separate "
2727 "loop";
2728  OptimizationRemarkAnalysis &R =
2729      recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;
2730
2731  switch (Dep.Type) {
2732  case MemoryDepChecker::Dependence::NoDep:
2733  case MemoryDepChecker::Dependence::Forward:
2734  case MemoryDepChecker::Dependence::BackwardVectorizable:
2735    llvm_unreachable("Unexpected dependence");
2736  case MemoryDepChecker::Dependence::Backward:
2737    R << "\nBackward loop carried data dependence.";
2738    break;
2739  case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
2740    R << "\nForward loop carried data dependence that prevents "
2741         "store-to-load forwarding.";
2742    break;
2743  case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
2744    R << "\nBackward loop carried data dependence that prevents "
2745         "store-to-load forwarding.";
2746    break;
2747  case MemoryDepChecker::Dependence::IndirectUnsafe:
2748    R << "\nUnsafe indirect dependence.";
2749    break;
2750  case MemoryDepChecker::Dependence::Unknown:
2751    R << "\nUnknown data dependence.";
2752    break;
2753 }
2754
2755 if (Instruction *I = Dep.getSource(*this)) {
2756 DebugLoc SourceLoc = I->getDebugLoc();
2757 if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
2758 SourceLoc = DD->getDebugLoc();
2759 if (SourceLoc)
2760 R << " Memory location is the same as accessed at "
2761 << ore::NV("Location", SourceLoc);
2762 }
2763}
2764
2765bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
2766                                           DominatorTree *DT) {
2767 assert(TheLoop->contains(BB) && "Unknown block used");
2768
2769 // Blocks that do not dominate the latch need predication.
2770 BasicBlock* Latch = TheLoop->getLoopLatch();
2771 return !DT->dominates(BB, Latch);
2772}
2773
2774OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2775 Instruction *I) {
2776 assert(!Report && "Multiple reports generated");
2777
2778 Value *CodeRegion = TheLoop->getHeader();
2779 DebugLoc DL = TheLoop->getStartLoc();
2780
2781 if (I) {
2782 CodeRegion = I->getParent();
2783 // If there is no debug location attached to the instruction, revert back to
2784 // using the loop's.
2785 if (I->getDebugLoc())
2786 DL = I->getDebugLoc();
2787 }
2788
2789 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2790 CodeRegion);
2791 return *Report;
2792}
2793
2794bool LoopAccessInfo::isInvariant(Value *V) const {
2795  auto *SE = PSE->getSE();
2796 // TODO: Is this really what we want? Even without FP SCEV, we may want some
2797 // trivially loop-invariant FP values to be considered invariant.
2798 if (!SE->isSCEVable(V->getType()))
2799 return false;
2800 const SCEV *S = SE->getSCEV(V);
2801 return SE->isLoopInvariant(S, TheLoop);
2802}
2803
2804/// Find the operand of the GEP that should be checked for consecutive
2805/// stores. This ignores trailing indices that have no effect on the final
2806/// pointer.
2807static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
2808 const DataLayout &DL = Gep->getModule()->getDataLayout();
2809 unsigned LastOperand = Gep->getNumOperands() - 1;
2810 TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
2811
2812 // Walk backwards and try to peel off zeros.
2813 while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
2814 // Find the type we're currently indexing into.
2815 gep_type_iterator GEPTI = gep_type_begin(Gep);
2816 std::advance(GEPTI, LastOperand - 2);
2817
2818 // If it's a type with the same allocation size as the result of the GEP we
2819 // can peel off the zero index.
2820 TypeSize ElemSize = GEPTI.isStruct()
2821 ? DL.getTypeAllocSize(GEPTI.getIndexedType())
2822                             : GEPTI.getSequentialElementStride(DL);
2823    if (ElemSize != GEPAllocSize)
2824 break;
2825 --LastOperand;
2826 }
2827
2828 return LastOperand;
2829}
2830
2831/// If the argument is a GEP, then returns the operand identified by
2832/// getGEPInductionOperand. However, if there is some other non-loop-invariant
2833/// operand, it returns that instead.
2834static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2835  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2836 if (!GEP)
2837 return Ptr;
2838
2839 unsigned InductionOperand = getGEPInductionOperand(GEP);
2840
2841 // Check that all of the gep indices are uniform except for our induction
2842 // operand.
2843 for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
2844 if (i != InductionOperand &&
2845 !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
2846 return Ptr;
2847 return GEP->getOperand(InductionOperand);
2848}
2849
2850/// If a value has only one user that is a CastInst, return it.
2851static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
2852  Value *UniqueCast = nullptr;
2853 for (User *U : Ptr->users()) {
2854 CastInst *CI = dyn_cast<CastInst>(U);
2855 if (CI && CI->getType() == Ty) {
2856 if (!UniqueCast)
2857 UniqueCast = CI;
2858 else
2859 return nullptr;
2860 }
2861 }
2862 return UniqueCast;
2863}
2864
2865/// Get the stride of a pointer access in a loop. Looks for symbolic
2866/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
2867static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
2868  auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2869 if (!PtrTy || PtrTy->isAggregateType())
2870 return nullptr;
2871
2872  // Try to remove a gep instruction to make the pointer (actually the index at
2873  // this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
2874  // the pointer; otherwise we are analyzing the index.
2875 Value *OrigPtr = Ptr;
2876
2877 // The size of the pointer access.
2878 int64_t PtrAccessSize = 1;
2879
2880 Ptr = stripGetElementPtr(Ptr, SE, Lp);
2881 const SCEV *V = SE->getSCEV(Ptr);
2882
2883 if (Ptr != OrigPtr)
2884 // Strip off casts.
2885 while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
2886 V = C->getOperand();
2887
2888 const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
2889 if (!S)
2890 return nullptr;
2891
2892 // If the pointer is invariant then there is no stride and it makes no
2893 // sense to add it here.
2894 if (Lp != S->getLoop())
2895 return nullptr;
2896
2897 V = S->getStepRecurrence(*SE);
2898 if (!V)
2899 return nullptr;
2900
2901 // Strip off the size of access multiplication if we are still analyzing the
2902 // pointer.
2903 if (OrigPtr == Ptr) {
2904 if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
2905 if (M->getOperand(0)->getSCEVType() != scConstant)
2906 return nullptr;
2907
2908 const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
2909
2910 // Huge step value - give up.
2911 if (APStepVal.getBitWidth() > 64)
2912 return nullptr;
2913
2914 int64_t StepVal = APStepVal.getSExtValue();
2915 if (PtrAccessSize != StepVal)
2916 return nullptr;
2917 V = M->getOperand(1);
2918 }
2919 }
2920
2921  // Note that the restrictions after this loop-invariant check are only
2922  // profitability restrictions.
2923 if (!SE->isLoopInvariant(V, Lp))
2924 return nullptr;
2925
2926 // Look for the loop invariant symbolic value.
2927 const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
2928 if (!U) {
2929 const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
2930 if (!C)
2931 return nullptr;
2932 U = dyn_cast<SCEVUnknown>(C->getOperand());
2933 if (!U)
2934 return nullptr;
2935
2936 // Match legacy behavior - this is not needed for correctness
2937 if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
2938 return nullptr;
2939 }
2940
2941 return V;
2942}
2943
2944void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2945 Value *Ptr = getLoadStorePointerOperand(MemAccess);
2946 if (!Ptr)
2947 return;
2948
2949 // Note: getStrideFromPointer is a *profitability* heuristic. We
2950 // could broaden the scope of values returned here - to anything
2951 // which happens to be loop invariant and contributes to the
2952 // computation of an interesting IV - but we chose not to as we
2953 // don't have a cost model here, and broadening the scope exposes
2954 // far too many unprofitable cases.
2955 const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2956 if (!StrideExpr)
2957 return;
2958
2959 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2960 "versioning:");
2961 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
2962
2963 if (!SpeculateUnitStride) {
2964 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
2965 return;
2966 }
2967
2968 // Avoid adding the "Stride == 1" predicate when we know that
2969 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2970 // or zero iteration loop, as Trip-Count <= Stride == 1.
2971 //
2972 // TODO: We are currently not making a very informed decision on when it is
2973 // beneficial to apply stride versioning. It might make more sense that the
2974 // users of this analysis (such as the vectorizer) will trigger it, based on
2975 // their specific cost considerations; For example, in cases where stride
2976 // versioning does not help resolving memory accesses/dependences, the
2977 // vectorizer should evaluate the cost of the runtime test, and the benefit
2978 // of various possible stride specializations, considering the alternatives
2979 // of using gather/scatters (if available).
2980
2981 const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2982
2983 // Match the types so we can compare the stride and the BETakenCount.
2984 // The Stride can be positive/negative, so we sign extend Stride;
2985 // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2986 const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2987 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
2988 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
2989 const SCEV *CastedStride = StrideExpr;
2990 const SCEV *CastedBECount = BETakenCount;
2991 ScalarEvolution *SE = PSE->getSE();
2992 if (BETypeSizeBits >= StrideTypeSizeBits)
2993 CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2994 else
2995 CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2996 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2997 // Since TripCount == BackEdgeTakenCount + 1, checking:
2998 // "Stride >= TripCount" is equivalent to checking:
2999 // Stride - BETakenCount > 0
3000 if (SE->isKnownPositive(StrideMinusBETaken)) {
3001 LLVM_DEBUG(
3002 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
3003 "Stride==1 predicate will imply that the loop executes "
3004 "at most once.\n");
3005 return;
3006 }
3007 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
3008
3009 // Strip back off the integer cast, and check that our result is a
3010 // SCEVUnknown as we expect.
3011 const SCEV *StrideBase = StrideExpr;
3012 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
3013 StrideBase = C->getOperand();
3014 SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
3015}
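// Editor's illustration (not part of the LLVM source): why the
// Stride >= TripCount guard above matters. Suppose a symbolic stride S feeds
// "A[i * S]" and the backedge-taken count is provably 7 (trip count 8). If SE
// can also show S - 7 > 0, then under the versioning predicate "S == 1" the
// loop could run at most one iteration (TripCount <= S == 1), so the versioned
// loop would never be worth vectorizing and no predicate is added. Otherwise
// the stride's SCEVUnknown is recorded in SymbolicStrides for later use by
// runtime versioning.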
3016
3017LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
3018                               const TargetLibraryInfo *TLI, AAResults *AA,
3019 DominatorTree *DT, LoopInfo *LI)
3020 : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
3021 PtrRtChecking(nullptr),
3022 DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
3023 PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
3024 if (canAnalyzeLoop()) {
3025 analyzeLoop(AA, LI, TLI, DT);
3026 }
3027}
3028
3029void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
3030  if (CanVecMem) {
3031 OS.indent(Depth) << "Memory dependences are safe";
3032 const MemoryDepChecker &DC = getDepChecker();
3033 if (!DC.isSafeForAnyVectorWidth())
3034 OS << " with a maximum safe vector width of "
3035 << DC.getMaxSafeVectorWidthInBits() << " bits";
3036 if (PtrRtChecking->Need)
3037 OS << " with run-time checks";
3038 OS << "\n";
3039 }
3040
3041 if (HasConvergentOp)
3042 OS.indent(Depth) << "Has convergent operation in loop\n";
3043
3044 if (Report)
3045 OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
3046
3047 if (auto *Dependences = DepChecker->getDependences()) {
3048 OS.indent(Depth) << "Dependences:\n";
3049 for (const auto &Dep : *Dependences) {
3050 Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
3051 OS << "\n";
3052 }
3053 } else
3054 OS.indent(Depth) << "Too many dependences, not recorded\n";
3055
3056  // List the pairs of accesses that need run-time checks to prove independence.
3057 PtrRtChecking->print(OS, Depth);
3058 OS << "\n";
3059
3060 OS.indent(Depth)
3061 << "Non vectorizable stores to invariant address were "
3062 << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
3063 HasLoadStoreDependenceInvolvingLoopInvariantAddress
3064 ? ""
3065 : "not ")
3066 << "found in loop.\n";
3067
3068 OS.indent(Depth) << "SCEV assumptions:\n";
3069 PSE->getPredicate().print(OS, Depth);
3070
3071 OS << "\n";
3072
3073 OS.indent(Depth) << "Expressions re-written:\n";
3074 PSE->print(OS, Depth);
3075}
3076
3077const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
3078  auto I = LoopAccessInfoMap.insert({&L, nullptr});
3079
3080 if (I.second)
3081 I.first->second =
3082 std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
3083
3084 return *I.first->second;
3085}
3086
3087bool LoopAccessInfoManager::invalidate(
3088    Function &F, const PreservedAnalyses &PA,
3089    FunctionAnalysisManager::Invalidator &Inv) {
3090  // Check whether our analysis is preserved.
3091 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3092 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3093 // If not, give up now.
3094 return true;
3095
3096 // Check whether the analyses we depend on became invalid for any reason.
3097 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3098 // invalid.
3099 return Inv.invalidate<AAManager>(F, PA) ||
3100         Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
3101         Inv.invalidate<LoopAnalysis>(F, PA) ||
3102         Inv.invalidate<DominatorTreeAnalysis>(F, PA);
3103}
3104
3105LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
3106                                              FunctionAnalysisManager &FAM) {
3107  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
3108  auto &AA = FAM.getResult<AAManager>(F);
3109 auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
3110 auto &LI = FAM.getResult<LoopAnalysis>(F);
3111 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
3112 return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
3113}
3114
3115AnalysisKey LoopAccessAnalysis::Key;
#define I(x, y, z)
Definition: MD5.cpp:58
This file provides utility analysis objects describing memory locations.
uint64_t High
#define P(N)
FunctionAnalysisManager FAM
This header defines various interfaces for pass management in LLVM.
This file defines the PointerIntPair class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
static LLVM_ATTRIBUTE_ALWAYS_INLINE bool CheckType(MVT::SimpleValueType VT, SDValue N, const TargetLowering *TLI, const DataLayout &DL)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
static const X86InstrFMA3Group Groups[]
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
This templated class represents "all analyses that operate over <a particular IR unit>" (e....
Definition: Analysis.h:47
API to communicate dependencies between analyses during invalidation.
Definition: PassManager.h:360
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA)
Trigger the invalidation of some other analysis pass if not already handled and return whether it was...
Definition: PassManager.h:378
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:289
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
iterator findValue(const ElemTy &V) const
findValue - Return an iterator to the specified value.
iterator insert(const ElemTy &Data)
insert - Insert a new value into the union/find set, ignoring the request if the value already exists...
member_iterator member_end() const
typename std::set< ECValue, ECValueComparator >::const_iterator iterator
iterator* - Provides a way to iterate over all values in the set.
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not alread...
const ElemTy & getLeaderValue(const ElemTy &V) const
getLeaderValue - Return the leader for the specified value that is in the set.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
Type * getResultElementType() const
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:83
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
static constexpr LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
This analysis provides dependence information for the memory accesses of a loop.
Result run(Function &F, FunctionAnalysisManager &AM)
bool invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv)
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
bool isInvariant(Value *V) const
Returns true if value V is loop invariant.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the information about the memory accesses in the loop.
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI)
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool isAnnotatedParallel() const
Returns true if the loop is annotated parallel.
Definition: LoopInfo.cpp:564
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
Metadata node.
Definition: Metadata.h:1067
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
ArrayRef< unsigned > getOrderForAccess(Value *Ptr, bool IsWrite) const
Return the program order indices for the access location (Ptr, IsWrite).
bool isSafeForAnyVectorWidth() const
Return true if the number of elements that are safe to operate on simultaneously is not bounded.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects)
Check whether the dependencies between the accesses are safe.
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const Loop * getInnermostLoop() const
uint64_t getMaxSafeVectorWidthInBits() const
Return the number of elements that are safe to operate on simultaneously, multiplied by the size of t...
bool isSafeForVectorization() const
No memory dependence was encountered that would inhibit vectorization.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Find the set of instructions that read or write via Ptr.
VectorizationSafetyStatus
Type to keep track of the status of the dependence check.
bool shouldRetryWithRuntimeCheck() const
In same cases when the dependency check fails we can still vectorize the loop with a dynamic array ac...
void addAccess(StoreInst *SI)
Register the location (instructions are given increasing numbers) of a write access.
PointerIntPair< Value *, 1, bool > MemAccessInfo
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
const Value * Ptr
The address of the start of the location.
Root of the metadata hierarchy.
Definition: Metadata.h:62
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
bool hasNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Returns true if we've proved that V doesn't wrap by means of a SCEV predicate.
void setNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Proves that V doesn't overflow by adding SCEV predicate.
void print(raw_ostream &OS, unsigned Depth) const
Print the SCEV mappings done by the Predicated Scalar Evolution.
const SCEVAddRecExpr * getAsAddRec(Value *V)
Attempts to produce an AddRecExpr for V by adding additional SCEV predicates.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
PreservedAnalysisChecker getChecker() const
Build a checker for this PreservedAnalyses and the specified analysis type.
Definition: Analysis.h:264
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
bool Need
This flag indicates if we need to add the runtime check.
void reset()
Reset the state of the pointer runtime information.
unsigned getNumberOfChecks() const
Returns the number of run-time checks required according to needsChecking.
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the list run-time memory checks necessary.
SmallVector< RuntimeCheckingPtrGroup, 2 > CheckingGroups
Holds a partitioning of pointers into "check groups".
void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies)
Generate the checks and store it.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze)
Insert a pointer and calculate the start and end SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const
Return the value of this chain of recurrences at the specified iteration number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
virtual void print(raw_ostream &OS, unsigned Depth=0) const =0
Prints a textual representation of this predicate with an indentation of Depth.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
bool isKnownNonPositive(const SCEV *S)
Test if the given expression is known to be non-positive.
const SCEV * getUMaxExpr(const SCEV *LHS, const SCEV *RHS)
const SCEVPredicate * getEqualPredicate(const SCEV *LHS, const SCEV *RHS)
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
const SCEV * getPtrToIntExpr(const SCEV *Op, Type *Ty)
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential=false)
const SCEV * getStoreSizeOfExpr(Type *IntTy, Type *StoreTy)
Return an expression for the store size of StoreTy that is type IntTy.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getSizeOfExpr(Type *IntTy, TypeSize Size)
Return an expression for a TypeSize.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
An efficient, type-erasing, non-owning reference to a callable.
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:606
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
unsigned getPointerAddressSpace(const Type *T)
Definition: SPIRVUtils.h:122
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1053
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool NullPointerIsDefined(const Function *F, unsigned AS=0)
Check whether null pointer dereferencing is considered undefined behavior for a given function or an ...
Definition: Function.cpp:2048
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:783
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:777
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:786
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
Dependece between memory access instructions.
DepType Type
The type of the dependence.
bool isPossiblyBackward() const
May be a lexically backward dependence type (includes Unknown).
bool isForward() const
Lexically forward dependence.
bool isBackward() const
Lexically backward dependence.
void print(raw_ostream &OS, unsigned Depth, const SmallVectorImpl< Instruction * > &Instrs) const
Print the dependence.
Instruction * getDestination(const LoopAccessInfo &LAI) const
Return the destination instruction of the dependence.
Instruction * getSource(const LoopAccessInfo &LAI) const
Return the source instruction of the dependence.
DepType
The type of the dependence.
static const char * DepName[]
String version of the types.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
unsigned AddressSpace
Address space of the involved pointers.
bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck)
Tries to add the pointer recorded in RtCheck at index Index to this pointer checking group.
bool NeedsFreeze
Whether the pointer needs to be frozen after expansion, e.g.
const SCEV * High
The SCEV expression which represents the upper bound of all the pointers in this group.
SmallVector< unsigned, 2 > Members
Indices of all the pointers that constitute this grouping.
RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
Create a new pointer checking group containing a single pointer, with index Index in RtCheck.
const SCEV * Low
The SCEV expression which represents the lower bound of all the pointers in this group.
bool IsWritePtr
Holds the information if this pointer is used for writing to memory.
unsigned DependencySetId
Holds the id of the set of pointers that could be dependent because of a shared underlying object.
unsigned AliasSetId
Holds the id of the disjoint alias set to which this pointer belongs.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static unsigned VectorizationFactor
VF as overridden by the user.
static unsigned RuntimeMemoryCheckThreshold
\When performing memory disambiguation checks at runtime do not make more than this number of compari...
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450