1//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The implementation of the loop memory dependence analysis that was
10// originally developed for the loop vectorizer.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/SmallSet.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
45#include "llvm/IR/InstrTypes.h"
46#include "llvm/IR/Instruction.h"
48#include "llvm/IR/Operator.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Type.h"
52#include "llvm/IR/Value.h"
53#include "llvm/IR/ValueHandle.h"
56#include "llvm/Support/Debug.h"
59#include <algorithm>
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64#include <variant>
65#include <vector>
66
67using namespace llvm;
68using namespace llvm::PatternMatch;
69
70#define DEBUG_TYPE "loop-accesses"
71
73VectorizationFactor("force-vector-width", cl::Hidden,
74 cl::desc("Sets the SIMD width. Zero is autoselect."),
77
79VectorizationInterleave("force-vector-interleave", cl::Hidden,
80 cl::desc("Sets the vectorization interleave count. "
81 "Zero is autoselect."),
85
87 "runtime-memory-check-threshold", cl::Hidden,
88 cl::desc("When performing memory disambiguation checks at runtime do not "
89 "generate more than this number of comparisons (default = 8)."),
92
/// The maximum number of iterations used to merge memory checks.
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(100));
99
100/// Maximum SIMD width.
101const unsigned VectorizerParams::MaxVectorWidth = 64;
102
/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(100));
109
110/// This enables versioning on the strides of symbolically striding memory
111/// accesses in code like the following.
112/// for (i = 0; i < N; ++i)
113/// A[i * Stride1] += B[i * Stride2] ...
114///
115/// Will be roughly translated to
116/// if (Stride1 == 1 && Stride2 == 1) {
117/// for (i = 0; i < N; i+=4)
118/// A[i:i+3] += ...
119/// } else
120/// ...
122 "enable-mem-access-versioning", cl::init(true), cl::Hidden,
123 cl::desc("Enable symbolic stride memory access versioning"));
124
125/// Enable store-to-load forwarding conflict detection. This option can
126/// be disabled for correctness testing.
128 "store-to-load-forwarding-conflict-detection", cl::Hidden,
129 cl::desc("Enable conflict detection in loop-access analysis"),
130 cl::init(true));
131
133 "max-forked-scev-depth", cl::Hidden,
134 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
135 cl::init(5));
136
138 "laa-speculate-unit-stride", cl::Hidden,
139 cl::desc("Speculate that non-constant strides are unit in LAA"),
140 cl::init(true));
141
143 "hoist-runtime-checks", cl::Hidden,
144 cl::desc(
145 "Hoist inner loop runtime memory checks to outer loop if possible"),
148
bool VectorizerParams::isInterleaveForced() {
  return ::VectorizationInterleave.getNumOccurrences() > 0;
}
152
const SCEV *llvm::replaceSymbolicStrideSCEV(
    PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &PtrToStride, Value *Ptr) {
156 const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
157
158 // If there is an entry in the map return the SCEV of the pointer with the
159 // symbolic stride replaced by one.
  DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
  if (SI == PtrToStride.end())
162 // For a non-symbolic stride, just return the original expression.
163 return OrigSCEV;
164
165 const SCEV *StrideSCEV = SI->second;
166 // Note: This assert is both overly strong and overly weak. The actual
167 // invariant here is that StrideSCEV should be loop invariant. The only
168 // such invariant strides we happen to speculate right now are unknowns
169 // and thus this is a reasonable proxy of the actual invariant.
170 assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
171
172 ScalarEvolution *SE = PSE.getSE();
173 const auto *CT = SE->getOne(StrideSCEV->getType());
174 PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
175 auto *Expr = PSE.getSCEV(Ptr);
176
177 LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
178 << " by: " << *Expr << "\n");
179 return Expr;
180}
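// Illustrative example (not part of the upstream source): for
//   for (i = 0; i < n; ++i)
//     A[i * Stride] = ...;
// the pointer SCEV looks like {%A,+,(4 * %Stride)}. If %Stride is in
// PtrToStride, the predicate "%Stride == 1" is added to PSE and the
// rewritten SCEV typically becomes {%A,+,4}, i.e. a unit-stride access.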
181
RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Index);
}
191
192/// Calculate Start and End points of memory access.
193/// Let's assume A is the first access and B is a memory access on N-th loop
194/// iteration. Then B is calculated as:
195/// B = A + Step*N .
196/// Step value may be positive or negative.
197/// N is a calculated back-edge taken count:
198/// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
199/// Start and End points are calculated in the following way:
200/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
201/// where SizeOfElt is the size of single memory access in bytes.
202///
203/// There is no conflict when the intervals are disjoint:
204/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
210 ScalarEvolution *SE = PSE.getSE();
211
212 const SCEV *ScStart;
213 const SCEV *ScEnd;
214
215 if (SE->isLoopInvariant(PtrExpr, Lp)) {
216 ScStart = ScEnd = PtrExpr;
217 } else {
218 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
219 assert(AR && "Invalid addrec expression");
220 const SCEV *Ex = PSE.getBackedgeTakenCount();
221
222 ScStart = AR->getStart();
223 ScEnd = AR->evaluateAtIteration(Ex, *SE);
224 const SCEV *Step = AR->getStepRecurrence(*SE);
225
226 // For expressions with negative step, the upper bound is ScStart and the
227 // lower bound is ScEnd.
228 if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
229 if (CStep->getValue()->isNegative())
230 std::swap(ScStart, ScEnd);
231 } else {
232 // Fallback case: the step is not constant, but we can still
233 // get the upper and lower bounds of the interval by using min/max
234 // expressions.
235 ScStart = SE->getUMinExpr(ScStart, ScEnd);
236 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
237 }
238 }
239 assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
240 assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
241
242 // Add the size of the pointed element to ScEnd.
243 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
244 Type *IdxTy = DL.getIndexType(Ptr->getType());
245 const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
246 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
247
248 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
249 NeedsFreeze);
250}
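// Illustrative example (hypothetical values): for an i32 access whose pointer
// SCEV is {%A,+,4} and a backedge-taken count of 99, ScStart = %A and
// ScEnd = %A + 396; after adding the element size, the checked interval is
// [%A, %A + 400). For a negative step such as {%A,+,-4}, Start and End are
// swapped before the element size is added.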
251
252void RuntimePointerChecking::tryToCreateDiffCheck(
253 const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
254 if (!CanUseDiffCheck)
255 return;
256
257 // If either group contains multiple different pointers, bail out.
258 // TODO: Support multiple pointers by using the minimum or maximum pointer,
259 // depending on src & sink.
260 if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
261 CanUseDiffCheck = false;
262 return;
263 }
264
265 PointerInfo *Src = &Pointers[CGI.Members[0]];
266 PointerInfo *Sink = &Pointers[CGJ.Members[0]];
267
268 // If either pointer is read and written, multiple checks may be needed. Bail
269 // out.
270 if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
271 !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
272 CanUseDiffCheck = false;
273 return;
274 }
275
276 ArrayRef<unsigned> AccSrc =
277 DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
278 ArrayRef<unsigned> AccSink =
279 DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
280 // If either pointer is accessed multiple times, there may not be a clear
281 // src/sink relation. Bail out for now.
282 if (AccSrc.size() != 1 || AccSink.size() != 1) {
283 CanUseDiffCheck = false;
284 return;
285 }
286 // If the sink is accessed before src, swap src/sink.
287 if (AccSink[0] < AccSrc[0])
288 std::swap(Src, Sink);
289
290 auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
291 auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
292 if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
293 SinkAR->getLoop() != DC.getInnermostLoop()) {
294 CanUseDiffCheck = false;
295 return;
296 }
297
  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
302 Type *SrcTy = getLoadStoreType(SrcInsts[0]);
303 Type *DstTy = getLoadStoreType(SinkInsts[0]);
304 if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
305 CanUseDiffCheck = false;
306 return;
307 }
308 const DataLayout &DL =
309 SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
310 unsigned AllocSize =
311 std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
312
313 // Only matching constant steps matching the AllocSize are supported at the
314 // moment. This simplifies the difference computation. Can be extended in the
315 // future.
316 auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
317 if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
318 Step->getAPInt().abs() != AllocSize) {
319 CanUseDiffCheck = false;
320 return;
321 }
322
323 IntegerType *IntTy =
324 IntegerType::get(Src->PointerValue->getContext(),
325 DL.getPointerSizeInBits(CGI.AddressSpace));
326
327 // When counting down, the dependence distance needs to be swapped.
328 if (Step->getValue()->isNegative())
329 std::swap(SinkAR, SrcAR);
330
331 const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
332 const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
333 if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
334 isa<SCEVCouldNotCompute>(SrcStartInt)) {
335 CanUseDiffCheck = false;
336 return;
337 }
338
339 const Loop *InnerLoop = SrcAR->getLoop();
340 // If the start values for both Src and Sink also vary according to an outer
341 // loop, then it's probably better to avoid creating diff checks because
342 // they may not be hoisted. We should instead let llvm::addRuntimeChecks
343 // do the expanded full range overlap checks, which can be hoisted.
344 if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
345 isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
346 auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
347 auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
348 const Loop *StartARLoop = SrcStartAR->getLoop();
349 if (StartARLoop == SinkStartAR->getLoop() &&
350 StartARLoop == InnerLoop->getParentLoop() &&
351 // If the diff check would already be loop invariant (due to the
352 // recurrences being the same), then we prefer to keep the diff checks
353 // because they are cheaper.
354 SrcStartAR->getStepRecurrence(*SE) !=
355 SinkStartAR->getStepRecurrence(*SE)) {
356 LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
357 "cannot be hoisted out of the outer loop\n");
358 CanUseDiffCheck = false;
359 return;
360 }
361 }
362
363 LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
364 << "SrcStart: " << *SrcStartInt << '\n'
365 << "SinkStartInt: " << *SinkStartInt << '\n');
366 DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
367 Src->NeedsFreeze || Sink->NeedsFreeze);
368}
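// Illustrative sketch (not from the source): for src = A[i] and sink = B[i],
// both i32 with step 4, a single check on the integer difference of the start
// addresses (SinkStartInt - SrcStartInt) can replace the usual two-comparison
// interval-overlap check; the later expansion compares that difference against
// the access size scaled by the chosen VF/IC, freezing the operands if either
// pointer may be poison or undef.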
369
SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
  SmallVector<RuntimePointerCheck, 4> Checks;

  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
      const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
      const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];

      if (needsChecking(CGI, CGJ)) {
        tryToCreateDiffCheck(CGI, CGJ);
        Checks.push_back(std::make_pair(&CGI, &CGJ));
      }
    }
  }
  return Checks;
}
386
387void RuntimePointerChecking::generateChecks(
388 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
389 assert(Checks.empty() && "Checks is not empty");
390 groupChecks(DepCands, UseDependencies);
391 Checks = generateChecks();
392}
393
bool RuntimePointerChecking::needsChecking(
    const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
396 for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
397 for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
398 if (needsChecking(M.Members[I], N.Members[J]))
399 return true;
400 return false;
401}
402
403/// Compare \p I and \p J and return the minimum.
404/// Return nullptr in case we couldn't find an answer.
405static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
406 ScalarEvolution *SE) {
407 const SCEV *Diff = SE->getMinusSCEV(J, I);
408 const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
409
410 if (!C)
411 return nullptr;
412 if (C->getValue()->isNegative())
413 return J;
414 return I;
415}
416
bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
                                         RuntimePointerChecking &RtCheck) {
419 return addPointer(
420 Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
421 RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
422 RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
423}
424
bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
                                         const SCEV *End, unsigned AS,
427 bool NeedsFreeze,
428 ScalarEvolution &SE) {
429 assert(AddressSpace == AS &&
430 "all pointers in a checking group must be in the same address space");
431
432 // Compare the starts and ends with the known minimum and maximum
433 // of this set. We need to know how we compare against the min/max
434 // of the set in order to be able to emit memchecks.
435 const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
436 if (!Min0)
437 return false;
438
439 const SCEV *Min1 = getMinFromExprs(End, High, &SE);
440 if (!Min1)
441 return false;
442
443 // Update the low bound expression if we've found a new min value.
444 if (Min0 == Start)
445 Low = Start;
446
447 // Update the high bound expression if we've found a new max value.
448 if (Min1 != End)
449 High = End;
450
  Members.push_back(Index);
  this->NeedsFreeze |= NeedsFreeze;
453 return true;
454}
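// Illustrative example (hypothetical bounds): a group currently covering
// [%A, %A + 400) can absorb a pointer with bounds [%A + 400, %A + 800)
// because both differences against Low and High are SCEV constants; Low stays
// %A and High becomes %A + 800. A pointer whose distance to the group bounds
// is not constant is rejected and ends up in its own group.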
455
456void RuntimePointerChecking::groupChecks(
457 MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
458 // We build the groups from dependency candidates equivalence classes
459 // because:
460 // - We know that pointers in the same equivalence class share
461 // the same underlying object and therefore there is a chance
462 // that we can compare pointers
463 // - We wouldn't be able to merge two pointers for which we need
464 // to emit a memcheck. The classes in DepCands are already
465 // conveniently built such that no two pointers in the same
466 // class need checking against each other.
467
468 // We use the following (greedy) algorithm to construct the groups
469 // For every pointer in the equivalence class:
470 // For each existing group:
471 // - if the difference between this pointer and the min/max bounds
472 // of the group is a constant, then make the pointer part of the
473 // group and update the min/max bounds of that group as required.
474
475 CheckingGroups.clear();
476
477 // If we need to check two pointers to the same underlying object
478 // with a non-constant difference, we shouldn't perform any pointer
479 // grouping with those pointers. This is because we can easily get
480 // into cases where the resulting check would return false, even when
481 // the accesses are safe.
482 //
483 // The following example shows this:
484 // for (i = 0; i < 1000; ++i)
485 // a[5000 + i * m] = a[i] + a[i + 9000]
486 //
487 // Here grouping gives a check of (5000, 5000 + 1000 * m) against
488 // (0, 10000) which is always false. However, if m is 1, there is no
489 // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
490 // us to perform an accurate check in this case.
491 //
492 // The above case requires that we have an UnknownDependence between
493 // accesses to the same underlying object. This cannot happen unless
494 // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
495 // is also false. In this case we will use the fallback path and create
496 // separate checking groups for all pointers.
497
498 // If we don't have the dependency partitions, construct a new
499 // checking pointer group for each pointer. This is also required
500 // for correctness, because in this case we can have checking between
501 // pointers to the same underlying object.
502 if (!UseDependencies) {
503 for (unsigned I = 0; I < Pointers.size(); ++I)
504 CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
505 return;
506 }
507
508 unsigned TotalComparisons = 0;
509
  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
512 auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
513 Iter.first->second.push_back(Index);
514 }
515
516 // We need to keep track of what pointers we've already seen so we
517 // don't process them twice.
519
520 // Go through all equivalence classes, get the "pointer check groups"
521 // and add them to the overall solution. We use the order in which accesses
522 // appear in 'Pointers' to enforce determinism.
523 for (unsigned I = 0; I < Pointers.size(); ++I) {
524 // We've seen this pointer before, and therefore already processed
525 // its equivalence class.
526 if (Seen.count(I))
527 continue;
528
529 MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
530 Pointers[I].IsWritePtr);
531
533 auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
534
535 // Because DepCands is constructed by visiting accesses in the order in
536 // which they appear in alias sets (which is deterministic) and the
537 // iteration order within an equivalence class member is only dependent on
538 // the order in which unions and insertions are performed on the
539 // equivalence class, the iteration order is deterministic.
540 for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
541 MI != ME; ++MI) {
542 auto PointerI = PositionMap.find(MI->getPointer());
543 assert(PointerI != PositionMap.end() &&
544 "pointer in equivalence class not found in PositionMap");
545 for (unsigned Pointer : PointerI->second) {
546 bool Merged = false;
547 // Mark this pointer as seen.
548 Seen.insert(Pointer);
549
550 // Go through all the existing sets and see if we can find one
551 // which can include this pointer.
552 for (RuntimeCheckingPtrGroup &Group : Groups) {
553 // Don't perform more than a certain amount of comparisons.
554 // This should limit the cost of grouping the pointers to something
555 // reasonable. If we do end up hitting this threshold, the algorithm
556 // will create separate groups for all remaining pointers.
557 if (TotalComparisons > MemoryCheckMergeThreshold)
558 break;
559
560 TotalComparisons++;
561
562 if (Group.addPointer(Pointer, *this)) {
563 Merged = true;
564 break;
565 }
566 }
567
568 if (!Merged)
569 // We couldn't add this pointer to any existing set or the threshold
570 // for the number of comparisons has been reached. Create a new group
571 // to hold the current pointer.
572 Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
573 }
574 }
575
576 // We've computed the grouped checks for this partition.
577 // Save the results and continue with the next one.
578 llvm::copy(Groups, std::back_inserter(CheckingGroups));
579 }
580}
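// Illustrative example (not from the source): with accesses A[i], A[i + 100]
// and B[i] (i32 elements), the two A pointers share an underlying object and
// have a constant distance of 400 bytes, so they are merged into one checking
// group whose bounds cover both; only a single group-vs-group check against
// the B group is then emitted instead of two separate pointer checks.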
581
bool RuntimePointerChecking::arePointersInSamePartition(
    const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
584 unsigned PtrIdx2) {
585 return (PtrToPartition[PtrIdx1] != -1 &&
586 PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
587}
588
589bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
590 const PointerInfo &PointerI = Pointers[I];
591 const PointerInfo &PointerJ = Pointers[J];
592
593 // No need to check if two readonly pointers intersect.
594 if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
595 return false;
596
597 // Only need to check pointers between two different dependency sets.
598 if (PointerI.DependencySetId == PointerJ.DependencySetId)
599 return false;
600
601 // Only need to check pointers in the same alias set.
602 if (PointerI.AliasSetId != PointerJ.AliasSetId)
603 return false;
604
605 return true;
606}
607
void RuntimePointerChecking::printChecks(
    raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
    unsigned Depth) const {
611 unsigned N = 0;
612 for (const auto &Check : Checks) {
613 const auto &First = Check.first->Members, &Second = Check.second->Members;
614
615 OS.indent(Depth) << "Check " << N++ << ":\n";
616
617 OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
618 for (unsigned K = 0; K < First.size(); ++K)
619 OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
620
621 OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
622 for (unsigned K = 0; K < Second.size(); ++K)
623 OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
624 }
625}
626
void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {

629 OS.indent(Depth) << "Run-time memory checks:\n";
630 printChecks(OS, Checks, Depth);
631
632 OS.indent(Depth) << "Grouped accesses:\n";
633 for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
634 const auto &CG = CheckingGroups[I];
635
636 OS.indent(Depth + 2) << "Group " << &CG << ":\n";
637 OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
638 << ")\n";
639 for (unsigned J = 0; J < CG.Members.size(); ++J) {
640 OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
641 << "\n";
642 }
643 }
644}
645
646namespace {
647
648/// Analyses memory accesses in a loop.
649///
650/// Checks whether run time pointer checks are needed and builds sets for data
651/// dependence checking.
652class AccessAnalysis {
653public:
654 /// Read or write access location.
655 typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
656 typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
657
  AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
                 MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
662 : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
663 LoopAliasScopes(LoopAliasScopes) {
664 // We're analyzing dependences across loop iterations.
665 BAA.enableCrossIterationMode();
666 }
667
668 /// Register a load and whether it is only read from.
669 void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
670 Value *Ptr = const_cast<Value *>(Loc.Ptr);
671 AST.add(adjustLoc(Loc));
672 Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
673 if (IsReadOnly)
674 ReadOnlyPtr.insert(Ptr);
675 }
676
677 /// Register a store.
678 void addStore(MemoryLocation &Loc, Type *AccessTy) {
679 Value *Ptr = const_cast<Value *>(Loc.Ptr);
680 AST.add(adjustLoc(Loc));
681 Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
682 }
683
684 /// Check if we can emit a run-time no-alias check for \p Access.
685 ///
686 /// Returns true if we can emit a run-time no alias check for \p Access.
687 /// If we can check this access, this also adds it to a dependence set and
688 /// adds a run-time to check for it to \p RtCheck. If \p Assume is true,
689 /// we will attempt to use additional run-time checks in order to get
690 /// the bounds of the pointer.
691 bool createCheckForAccess(RuntimePointerChecking &RtCheck,
692 MemAccessInfo Access, Type *AccessTy,
693 const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
696 unsigned ASId, bool ShouldCheckStride, bool Assume);
697
698 /// Check whether we can check the pointers at runtime for
699 /// non-intersection.
700 ///
701 /// Returns true if we need no check or if we do and we can generate them
702 /// (i.e. the pointers have computable bounds).
703 bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
704 Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides,
705 Value *&UncomputablePtr, bool ShouldCheckWrap = false);
706
707 /// Goes over all memory accesses, checks whether a RT check is needed
708 /// and builds sets of dependent accesses.
709 void buildDependenceSets() {
710 processMemAccesses();
711 }
712
713 /// Initial processing of memory accesses determined that we need to
714 /// perform dependency checking.
715 ///
716 /// Note that this can later be cleared if we retry memcheck analysis without
717 /// dependency checking (i.e. FoundNonConstantDistanceDependence).
718 bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
719
720 /// We decided that no dependence analysis would be used. Reset the state.
721 void resetDepChecks(MemoryDepChecker &DepChecker) {
722 CheckDeps.clear();
723 DepChecker.clearDependences();
724 }
725
726 MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
727
  const DenseMap<Value *, SmallVector<const Value *, 16>> &
  getUnderlyingObjects() {
    return UnderlyingObjects;
  }
732
733private:
735
736 /// Adjust the MemoryLocation so that it represents accesses to this
737 /// location across all iterations, rather than a single one.
738 MemoryLocation adjustLoc(MemoryLocation Loc) const {
739 // The accessed location varies within the loop, but remains within the
740 // underlying object.
    Loc.Size = LocationSize::beforeOrAfterPointer();
    Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
743 Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
744 return Loc;
745 }
746
747 /// Drop alias scopes that are only valid within a single loop iteration.
748 MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
749 if (!ScopeList)
750 return nullptr;
751
752 // For the sake of simplicity, drop the whole scope list if any scope is
753 // iteration-local.
754 if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
755 return LoopAliasScopes.contains(cast<MDNode>(Scope));
756 }))
757 return nullptr;
758
759 return ScopeList;
760 }
761
762 /// Go over all memory access and check whether runtime pointer checks
763 /// are needed and build sets of dependency check candidates.
764 void processMemAccesses();
765
766 /// Map of all accesses. Values are the types used to access memory pointed to
767 /// by the pointer.
768 PtrAccessMap Accesses;
769
770 /// The loop being checked.
771 const Loop *TheLoop;
772
773 /// List of accesses that need a further dependence check.
774 MemAccessInfoList CheckDeps;
775
776 /// Set of pointers that are read only.
777 SmallPtrSet<Value*, 16> ReadOnlyPtr;
778
779 /// Batched alias analysis results.
780 BatchAAResults BAA;
781
782 /// An alias set tracker to partition the access set by underlying object and
783 /// intrinsic property (such as TBAA metadata).
784 AliasSetTracker AST;
785
786 LoopInfo *LI;
787
788 /// Sets of potentially dependent accesses - members of one set share an
789 /// underlying pointer. The set "CheckDeps" identifies which sets really need a
790 /// dependence check.
  MemoryDepChecker::DepCandidates &DepCands;
793 /// Initial processing of memory accesses determined that we may need
794 /// to add memchecks. Perform the analysis to determine the necessary checks.
795 ///
796 /// Note that this is different from isDependencyCheckNeeded. When we retry
797 /// memcheck analysis without dependency checking
798 /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
799 /// cleared while this remains set if we have potentially dependent accesses.
800 bool IsRTCheckAnalysisNeeded = false;
801
802 /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

807 /// Alias scopes that are declared inside the loop, and as such not valid
808 /// across iterations.
809 SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
810};
811
812} // end anonymous namespace
813
814/// Check whether a pointer can participate in a runtime bounds check.
815/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
816/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
                                const SCEV *PtrScev, Loop *L, bool Assume) {
819 // The bounds for loop-invariant pointer is trivial.
820 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
821 return true;
822
823 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
824
825 if (!AR && Assume)
826 AR = PSE.getAsAddRec(Ptr);
827
828 if (!AR)
829 return false;
830
831 return AR->isAffine();
832}
833
834/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
                     const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr,
                     Type *AccessTy,
837 Loop *L) {
838 const SCEV *PtrScev = PSE.getSCEV(Ptr);
839 if (PSE.getSE()->isLoopInvariant(PtrScev, L))
840 return true;
841
842 int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
843 if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
844 return true;
845
846 return false;
847}
848
849static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
850 function_ref<void(Value *)> AddPointer) {
  SmallPtrSet<Value *, 8> Visited;
  SmallVector<Value *> WorkList;
853 WorkList.push_back(StartPtr);
854
855 while (!WorkList.empty()) {
856 Value *Ptr = WorkList.pop_back_val();
857 if (!Visited.insert(Ptr).second)
858 continue;
859 auto *PN = dyn_cast<PHINode>(Ptr);
860 // SCEV does not look through non-header PHIs inside the loop. Such phis
861 // can be analyzed by adding separate accesses for each incoming pointer
862 // value.
863 if (PN && InnermostLoop.contains(PN->getParent()) &&
864 PN->getParent() != InnermostLoop.getHeader()) {
865 for (const Use &Inc : PN->incoming_values())
866 WorkList.push_back(Inc);
867 } else
868 AddPointer(Ptr);
869 }
870}
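// Illustrative IR (hypothetical): a non-header phi such as
//   body:
//     %p = phi ptr [ %gep.a, %then ], [ %gep.b, %else ]
//     store i32 0, ptr %p
// is not looked through by SCEV, so visitPointers registers %gep.a and
// %gep.b as two separate accesses instead of the phi itself.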
871
872// Walk back through the IR for a pointer, looking for a select like the
873// following:
874//
875// %offset = select i1 %cmp, i64 %a, i64 %b
876// %addr = getelementptr double, double* %base, i64 %offset
877// %ld = load double, double* %addr, align 8
878//
879// We won't be able to form a single SCEVAddRecExpr from this since the
880// address for each loop iteration depends on %cmp. We could potentially
881// produce multiple valid SCEVAddRecExprs, though, and check all of them for
882// memory safety/aliasing if needed.
883//
884// If we encounter some IR we don't yet handle, or something obviously fine
885// like a constant, then we just add the SCEV for that term to the list passed
886// in by the caller. If we have a node that may potentially yield a valid
887// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
888// ourselves before adding to the list.
889static void findForkedSCEVs(
890 ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
893 // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
894 // we've exceeded our limit on recursion, just return whatever we have
895 // regardless of whether it can be used for a forked pointer or not, along
896 // with an indication of whether it might be a poison or undef value.
897 const SCEV *Scev = SE->getSCEV(Ptr);
898 if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
899 !isa<Instruction>(Ptr) || Depth == 0) {
900 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
901 return;
902 }
903
904 Depth--;
905
906 auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
907 return get<1>(S);
908 };
909
910 auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
911 switch (Opcode) {
912 case Instruction::Add:
913 return SE->getAddExpr(L, R);
914 case Instruction::Sub:
915 return SE->getMinusSCEV(L, R);
916 default:
917 llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
918 }
919 };
920
921 Instruction *I = cast<Instruction>(Ptr);
922 unsigned Opcode = I->getOpcode();
923 switch (Opcode) {
924 case Instruction::GetElementPtr: {
925 GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
926 Type *SourceTy = GEP->getSourceElementType();
927 // We only handle base + single offset GEPs here for now.
928 // Not dealing with preexisting gathers yet, so no vectors.
929 if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
930 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
931 break;
932 }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
936 findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
937
938 // See if we need to freeze our fork...
939 bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
940 any_of(OffsetScevs, UndefPoisonCheck);
941
942 // Check that we only have a single fork, on either the base or the offset.
943 // Copy the SCEV across for the one without a fork in order to generate
944 // the full SCEV for both sides of the GEP.
945 if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
946 BaseScevs.push_back(BaseScevs[0]);
947 else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
948 OffsetScevs.push_back(OffsetScevs[0]);
949 else {
950 ScevList.emplace_back(Scev, NeedsFreeze);
951 break;
952 }
953
954 // Find the pointer type we need to extend to.
955 Type *IntPtrTy = SE->getEffectiveSCEVType(
956 SE->getSCEV(GEP->getPointerOperand())->getType());
957
958 // Find the size of the type being pointed to. We only have a single
959 // index term (guarded above) so we don't need to index into arrays or
960 // structures, just get the size of the scalar value.
961 const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
962
963 // Scale up the offsets by the size of the type, then add to the bases.
964 const SCEV *Scaled1 = SE->getMulExpr(
965 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
966 const SCEV *Scaled2 = SE->getMulExpr(
967 Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
968 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
969 NeedsFreeze);
970 ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
971 NeedsFreeze);
972 break;
973 }
974 case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
977 // support a single select per pointer so if there's another behind this
978 // then we just bail out and return the generic SCEV.
979 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
980 findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
981 if (ChildScevs.size() == 2) {
982 ScevList.push_back(ChildScevs[0]);
983 ScevList.push_back(ChildScevs[1]);
984 } else
985 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
986 break;
987 }
988 case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
991 // support a single phi per pointer so if there's another behind this
992 // then we just bail out and return the generic SCEV.
993 if (I->getNumOperands() == 2) {
994 findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
995 findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
996 }
997 if (ChildScevs.size() == 2) {
998 ScevList.push_back(ChildScevs[0]);
999 ScevList.push_back(ChildScevs[1]);
1000 } else
1001 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1002 break;
1003 }
1004 case Instruction::Add:
1005 case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> RScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
1009 findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);
1010
1011 // See if we need to freeze our fork...
1012 bool NeedsFreeze =
1013 any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);
1014
1015 // Check that we only have a single fork, on either the left or right side.
1016 // Copy the SCEV across for the one without a fork in order to generate
1017 // the full SCEV for both sides of the BinOp.
1018 if (LScevs.size() == 2 && RScevs.size() == 1)
1019 RScevs.push_back(RScevs[0]);
1020 else if (RScevs.size() == 2 && LScevs.size() == 1)
1021 LScevs.push_back(LScevs[0]);
1022 else {
1023 ScevList.emplace_back(Scev, NeedsFreeze);
1024 break;
1025 }
1026
1027 ScevList.emplace_back(
1028 GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
1029 NeedsFreeze);
1030 ScevList.emplace_back(
1031 GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
1032 NeedsFreeze);
1033 break;
1034 }
1035 default:
1036 // Just return the current SCEV if we haven't handled the instruction yet.
1037 LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
1038 ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
1039 break;
1040 }
1041}
1042
static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
findForkedPointer(PredicatedScalarEvolution &PSE,
                  const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
1046 const Loop *L) {
1047 ScalarEvolution *SE = PSE.getSE();
1048 assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
1051
1052 // For now, we will only accept a forked pointer with two possible SCEVs
1053 // that are either SCEVAddRecExprs or loop invariant.
1054 if (Scevs.size() == 2 &&
1055 (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
1056 SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
1057 (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
1058 SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
1059 LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
1060 LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
1061 LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
1062 return Scevs;
1063 }
1064
1065 return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
1066}
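// Illustrative example (hypothetical IR): continuing the select case above,
//   %offset = select i1 %cmp, i64 %a, i64 %b
//   %addr = getelementptr double, ptr %base, i64 %offset
// yields the two candidate SCEVs (%base + 8 * %a) and (%base + 8 * %b); if
// both are affine AddRecs or loop invariant, each one is bounds-checked
// separately instead of giving up on the forked pointer.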
1067
1068bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
1069 MemAccessInfo Access, Type *AccessTy,
1070 const DenseMap<Value *, const SCEV *> &StridesMap,
                                          DenseMap<Value *, unsigned> &DepSetId,
                                          Loop *TheLoop, unsigned &RunningDepId,
1073 unsigned ASId, bool ShouldCheckWrap,
1074 bool Assume) {
1075 Value *Ptr = Access.getPointer();
1076
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
1079
1080 for (auto &P : TranslatedPtrs) {
1081 const SCEV *PtrExpr = get<0>(P);
1082 if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
1083 return false;
1084
1085 // When we run after a failing dependency check we have to make sure
1086 // we don't have wrapping pointers.
1087 if (ShouldCheckWrap) {
1088 // Skip wrap checking when translating pointers.
1089 if (TranslatedPtrs.size() > 1)
1090 return false;
1091
1092 if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
1093 auto *Expr = PSE.getSCEV(Ptr);
1094 if (!Assume || !isa<SCEVAddRecExpr>(Expr))
1095 return false;
        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
      }
1098 }
1099 // If there's only one option for Ptr, look it up after bounds and wrap
1100 // checking, because assumptions might have been added to PSE.
1101 if (TranslatedPtrs.size() == 1)
1102 TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
1103 false};
1104 }
1105
1106 for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
1107 // The id of the dependence set.
1108 unsigned DepId;
1109
1110 if (isDependencyCheckNeeded()) {
1111 Value *Leader = DepCands.getLeaderValue(Access).getPointer();
1112 unsigned &LeaderId = DepSetId[Leader];
1113 if (!LeaderId)
1114 LeaderId = RunningDepId++;
1115 DepId = LeaderId;
1116 } else
1117 // Each access has its own dependence set.
1118 DepId = RunningDepId++;
1119
1120 bool IsWrite = Access.getInt();
1121 RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
1122 NeedsFreeze);
1123 LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
1124 }
1125
1126 return true;
1127}
1128
1129bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
1130 ScalarEvolution *SE, Loop *TheLoop,
1131 const DenseMap<Value *, const SCEV *> &StridesMap,
1132 Value *&UncomputablePtr, bool ShouldCheckWrap) {
1133 // Find pointers with computable bounds. We are going to use this information
1134 // to place a runtime bound check.
1135 bool CanDoRT = true;
1136
1137 bool MayNeedRTCheck = false;
1138 if (!IsRTCheckAnalysisNeeded) return true;
1139
1140 bool IsDepCheckNeeded = isDependencyCheckNeeded();
1141
1142 // We assign a consecutive id to accesses from different alias sets.
1143 // Accesses between different groups don't need to be checked.
1144 unsigned ASId = 0;
1145 for (auto &AS : AST) {
1146 int NumReadPtrChecks = 0;
1147 int NumWritePtrChecks = 0;
1148 bool CanDoAliasSetRT = true;
1149 ++ASId;
1150 auto ASPointers = AS.getPointers();
1151
1152 // We assign consecutive ids to accesses from different dependence sets.
1153 // Accesses within the same set don't need a runtime check.
1154 unsigned RunningDepId = 1;
1156
1158
1159 // First, count how many write and read accesses are in the alias set. Also
1160 // collect MemAccessInfos for later.
1162 for (const Value *Ptr_ : ASPointers) {
1163 Value *Ptr = const_cast<Value *>(Ptr_);
1164 bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
1165 if (IsWrite)
1166 ++NumWritePtrChecks;
1167 else
1168 ++NumReadPtrChecks;
1169 AccessInfos.emplace_back(Ptr, IsWrite);
1170 }
1171
1172 // We do not need runtime checks for this alias set, if there are no writes
1173 // or a single write and no reads.
1174 if (NumWritePtrChecks == 0 ||
1175 (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
1176 assert((ASPointers.size() <= 1 ||
1177 all_of(ASPointers,
1178 [this](const Value *Ptr) {
1179 MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
1180 true);
1181 return DepCands.findValue(AccessWrite) == DepCands.end();
1182 })) &&
1183 "Can only skip updating CanDoRT below, if all entries in AS "
1184 "are reads or there is at most 1 entry");
1185 continue;
1186 }
1187
1188 for (auto &Access : AccessInfos) {
1189 for (const auto &AccessTy : Accesses[Access]) {
1190 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1191 DepSetId, TheLoop, RunningDepId, ASId,
1192 ShouldCheckWrap, false)) {
1193 LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
1194 << *Access.getPointer() << '\n');
1195 Retries.push_back({Access, AccessTy});
1196 CanDoAliasSetRT = false;
1197 }
1198 }
1199 }
1200
1201 // Note that this function computes CanDoRT and MayNeedRTCheck
1202 // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
1203 // we have a pointer for which we couldn't find the bounds but we don't
1204 // actually need to emit any checks so it does not matter.
1205 //
1206 // We need runtime checks for this alias set, if there are at least 2
1207 // dependence sets (in which case RunningDepId > 2) or if we need to re-try
1208 // any bound checks (because in that case the number of dependence sets is
1209 // incomplete).
1210 bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();
1211
1212 // We need to perform run-time alias checks, but some pointers had bounds
1213 // that couldn't be checked.
1214 if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
1215 // Reset the CanDoSetRt flag and retry all accesses that have failed.
1216 // We know that we need these checks, so we can now be more aggressive
1217 // and add further checks if required (overflow checks).
1218 CanDoAliasSetRT = true;
1219 for (auto Retry : Retries) {
1220 MemAccessInfo Access = Retry.first;
1221 Type *AccessTy = Retry.second;
1222 if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
1223 DepSetId, TheLoop, RunningDepId, ASId,
1224 ShouldCheckWrap, /*Assume=*/true)) {
1225 CanDoAliasSetRT = false;
1226 UncomputablePtr = Access.getPointer();
1227 break;
1228 }
1229 }
1230 }
1231
1232 CanDoRT &= CanDoAliasSetRT;
1233 MayNeedRTCheck |= NeedsAliasSetRTCheck;
1234 ++ASId;
1235 }
1236
1237 // If the pointers that we would use for the bounds comparison have different
1238 // address spaces, assume the values aren't directly comparable, so we can't
1239 // use them for the runtime check. We also have to assume they could
1240 // overlap. In the future there should be metadata for whether address spaces
1241 // are disjoint.
1242 unsigned NumPointers = RtCheck.Pointers.size();
1243 for (unsigned i = 0; i < NumPointers; ++i) {
1244 for (unsigned j = i + 1; j < NumPointers; ++j) {
1245 // Only need to check pointers between two different dependency sets.
1246 if (RtCheck.Pointers[i].DependencySetId ==
1247 RtCheck.Pointers[j].DependencySetId)
1248 continue;
1249 // Only need to check pointers in the same alias set.
1250 if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
1251 continue;
1252
1253 Value *PtrI = RtCheck.Pointers[i].PointerValue;
1254 Value *PtrJ = RtCheck.Pointers[j].PointerValue;
1255
1256 unsigned ASi = PtrI->getType()->getPointerAddressSpace();
1257 unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
1258 if (ASi != ASj) {
1259 LLVM_DEBUG(
1260 dbgs() << "LAA: Runtime check would require comparison between"
1261 " different address spaces\n");
1262 return false;
1263 }
1264 }
1265 }
1266
1267 if (MayNeedRTCheck && CanDoRT)
1268 RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
1269
1270 LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
1271 << " pointer comparisons.\n");
1272
1273 // If we can do run-time checks, but there are no checks, no runtime checks
1274 // are needed. This can happen when all pointers point to the same underlying
1275 // object for example.
1276 RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
1277
1278 bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
1279 if (!CanDoRTIfNeeded)
1280 RtCheck.reset();
1281 return CanDoRTIfNeeded;
1282}
1283
1284void AccessAnalysis::processMemAccesses() {
1285 // We process the set twice: first we process read-write pointers, last we
1286 // process read-only pointers. This allows us to skip dependence tests for
1287 // read-only pointers.
1288
1289 LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
1290 LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
1291 LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
1292 LLVM_DEBUG({
1293 for (auto A : Accesses)
1294 dbgs() << "\t" << *A.first.getPointer() << " ("
1295 << (A.first.getInt()
1296 ? "write"
1297 : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
1298 : "read"))
1299 << ")\n";
1300 });
1301
1302 // The AliasSetTracker has nicely partitioned our pointers by metadata
1303 // compatibility and potential for underlying-object overlap. As a result, we
1304 // only need to check for potential pointer dependencies within each alias
1305 // set.
1306 for (const auto &AS : AST) {
1307 // Note that both the alias-set tracker and the alias sets themselves used
1308 // ordered collections internally and so the iteration order here is
1309 // deterministic.
1310 auto ASPointers = AS.getPointers();
1311
1312 bool SetHasWrite = false;
1313
1314 // Map of pointers to last access encountered.
1315 typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
1316 UnderlyingObjToAccessMap ObjToLastAccess;
1317
1318 // Set of access to check after all writes have been processed.
1319 PtrAccessMap DeferredAccesses;
1320
1321 // Iterate over each alias set twice, once to process read/write pointers,
1322 // and then to process read-only pointers.
1323 for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
1324 bool UseDeferred = SetIteration > 0;
1325 PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
1326
1327 for (const Value *Ptr_ : ASPointers) {
1328 Value *Ptr = const_cast<Value *>(Ptr_);
1329
1330 // For a single memory access in AliasSetTracker, Accesses may contain
1331 // both read and write, and they both need to be handled for CheckDeps.
1332 for (const auto &AC : S) {
1333 if (AC.first.getPointer() != Ptr)
1334 continue;
1335
1336 bool IsWrite = AC.first.getInt();
1337
1338 // If we're using the deferred access set, then it contains only
1339 // reads.
1340 bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
1341 if (UseDeferred && !IsReadOnlyPtr)
1342 continue;
1343 // Otherwise, the pointer must be in the PtrAccessSet, either as a
1344 // read or a write.
1345 assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
1346 S.count(MemAccessInfo(Ptr, false))) &&
1347 "Alias-set pointer not in the access set?");
1348
1349 MemAccessInfo Access(Ptr, IsWrite);
1350 DepCands.insert(Access);
1351
1352 // Memorize read-only pointers for later processing and skip them in
1353 // the first round (they need to be checked after we have seen all
1354 // write pointers). Note: we also mark pointers that are not
1355 // consecutive as "read-only" pointers (so that we check
1356 // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 if (!UseDeferred && IsReadOnlyPtr) {
1358 // We only use the pointer keys, the types vector values don't
1359 // matter.
1360 DeferredAccesses.insert({Access, {}});
1361 continue;
1362 }
1363
1364 // If this is a write - check other reads and writes for conflicts. If
1365 // this is a read only check other writes for conflicts (but only if
1366 // there is no other write to the ptr - this is an optimization to
1367 // catch "a[i] = a[i] + " without having to do a dependence check).
1368 if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
1369 CheckDeps.push_back(Access);
1370 IsRTCheckAnalysisNeeded = true;
1371 }
1372
1373 if (IsWrite)
1374 SetHasWrite = true;
1375
1376 // Create sets of pointers connected by a shared alias set and
1377 // underlying object.
1378 typedef SmallVector<const Value *, 16> ValueVector;
1379 ValueVector TempObjects;
1380
1381 UnderlyingObjects[Ptr] = {};
1382 SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
1383 ::getUnderlyingObjects(Ptr, UOs, LI);
1385 << "Underlying objects for pointer " << *Ptr << "\n");
1386 for (const Value *UnderlyingObj : UOs) {
1387 // nullptr never alias, don't join sets for pointer that have "null"
1388 // in their UnderlyingObjects list.
          if (isa<ConstantPointerNull>(UnderlyingObj) &&
              !NullPointerIsDefined(
                  TheLoop->getHeader()->getParent(),
                  UnderlyingObj->getType()->getPointerAddressSpace()))
1393 continue;
1394
1395 UnderlyingObjToAccessMap::iterator Prev =
1396 ObjToLastAccess.find(UnderlyingObj);
1397 if (Prev != ObjToLastAccess.end())
1398 DepCands.unionSets(Access, Prev->second);
1399
1400 ObjToLastAccess[UnderlyingObj] = Access;
1401 LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
1402 }
1403 }
1404 }
1405 }
1406 }
1407}
1408
1409/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
1410/// i.e. monotonically increasing/decreasing.
1411static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
1412 PredicatedScalarEvolution &PSE, const Loop *L) {
1413
  // FIXME: This should probably only return true for NUW.
  if (AR->getNoWrapFlags(SCEV::NoWrapMask))
    return true;

  if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;
1420
1421 // Scalar evolution does not propagate the non-wrapping flags to values that
1422 // are derived from a non-wrapping induction variable because non-wrapping
1423 // could be flow-sensitive.
1424 //
1425 // Look through the potentially overflowing instruction to try to prove
1426 // non-wrapping for the *specific* value of Ptr.
1427
1428 // The arithmetic implied by an inbounds GEP can't overflow.
1429 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1430 if (!GEP || !GEP->isInBounds())
1431 return false;
1432
1433 // Make sure there is only one non-const index and analyze that.
1434 Value *NonConstIndex = nullptr;
1435 for (Value *Index : GEP->indices())
1436 if (!isa<ConstantInt>(Index)) {
1437 if (NonConstIndex)
1438 return false;
1439 NonConstIndex = Index;
1440 }
1441 if (!NonConstIndex)
1442 // The recurrence is on the pointer, ignore for now.
1443 return false;
1444
1445 // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
1446 // AddRec using a NSW operation.
1447 if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
1448 if (OBO->hasNoSignedWrap() &&
1449 // Assume the other operand is constant so that the AddRec can be
1450 // easily found.
1451 isa<ConstantInt>(OBO->getOperand(1))) {
1452 auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
1453
1454 if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
1455 return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
1456 }
1457
1458 return false;
1459}
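// Illustrative example (hypothetical IR): for
//   %idx = add nsw i64 %iv, 1        ; %iv is an NSW AddRec {0,+,1} in L
//   %gep = getelementptr inbounds i32, ptr %A, i64 %idx
// the single non-constant GEP index is derived from an NSW AddRec via an NSW
// add with a constant operand, so the pointer is treated as non-wrapping.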
1460
1461/// Check whether the access through \p Ptr has a constant stride.
std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE,
                                          Type *AccessTy, Value *Ptr,
1464 const Loop *Lp,
1465 const DenseMap<Value *, const SCEV *> &StridesMap,
1466 bool Assume, bool ShouldCheckWrap) {
1467 Type *Ty = Ptr->getType();
1468 assert(Ty->isPointerTy() && "Unexpected non-ptr");
1469
1470 if (isa<ScalableVectorType>(AccessTy)) {
1471 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
1472 << "\n");
1473 return std::nullopt;
1474 }
1475
1476 const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
1477
1478 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
1479 if (Assume && !AR)
1480 AR = PSE.getAsAddRec(Ptr);
1481
1482 if (!AR) {
1483 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
1484 << " SCEV: " << *PtrScev << "\n");
1485 return std::nullopt;
1486 }
1487
1488 // The access function must stride over the innermost loop.
1489 if (Lp != AR->getLoop()) {
1490 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
1491 << *Ptr << " SCEV: " << *AR << "\n");
1492 return std::nullopt;
1493 }
1494
1495 // Check the step is constant.
1496 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
1497
1498 // Calculate the pointer stride and check if it is constant.
1499 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
1500 if (!C) {
1501 LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
1502 << " SCEV: " << *AR << "\n");
1503 return std::nullopt;
1504 }
1505
1506 auto &DL = Lp->getHeader()->getModule()->getDataLayout();
1507 TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
1508 int64_t Size = AllocSize.getFixedValue();
1509 const APInt &APStepVal = C->getAPInt();
1510
1511 // Huge step value - give up.
1512 if (APStepVal.getBitWidth() > 64)
1513 return std::nullopt;
1514
1515 int64_t StepVal = APStepVal.getSExtValue();
1516
1517 // Strided access.
1518 int64_t Stride = StepVal / Size;
1519 int64_t Rem = StepVal % Size;
1520 if (Rem)
1521 return std::nullopt;
1522
1523 if (!ShouldCheckWrap)
1524 return Stride;
1525
1526 // The address calculation must not wrap. Otherwise, a dependence could be
1527 // inverted.
1528 if (isNoWrapAddRec(Ptr, AR, PSE, Lp))
1529 return Stride;
1530
1531 // An inbounds getelementptr that is a AddRec with a unit stride
1532 // cannot wrap per definition. If it did, the result would be poison
1533 // and any memory access dependent on it would be immediate UB
1534 // when executed.
1535 if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
1536 GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1))
1537 return Stride;
1538
1539 // If the null pointer is undefined, then an access sequence which would
1540 // otherwise access it can be assumed not to unsigned wrap. Note that this
1541 // assumes the object in memory is aligned to the natural alignment.
1542 unsigned AddrSpace = Ty->getPointerAddressSpace();
1543 if (!NullPointerIsDefined(Lp->getHeader()->getParent(), AddrSpace) &&
1544 (Stride == 1 || Stride == -1))
1545 return Stride;
1546
1547 if (Assume) {
1549 LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
1550 << "LAA: Pointer: " << *Ptr << "\n"
1551 << "LAA: SCEV: " << *AR << "\n"
1552 << "LAA: Added an overflow assumption\n");
1553 return Stride;
1554 }
1555 LLVM_DEBUG(
1556 dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
1557 << *Ptr << " SCEV: " << *AR << "\n");
1558 return std::nullopt;
1559}
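// Illustrative example (hypothetical values): for an i32 access whose pointer
// SCEV is {%A,+,8}<%L>, AllocSize is 4 and the returned stride is 2; a step
// of 6 leaves a remainder and yields std::nullopt, as does a pointer that is
// not an affine AddRec over the queried loop.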
1560
1561std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
1562 Type *ElemTyB, Value *PtrB,
1563 const DataLayout &DL,
1564 ScalarEvolution &SE, bool StrictCheck,
1565 bool CheckType) {
1566 assert(PtrA && PtrB && "Expected non-nullptr pointers.");
1567
1568 // Make sure that A and B are different pointers.
1569 if (PtrA == PtrB)
1570 return 0;
1571
1572 // Make sure that the element types are the same if required.
1573 if (CheckType && ElemTyA != ElemTyB)
1574 return std::nullopt;
1575
1576 unsigned ASA = PtrA->getType()->getPointerAddressSpace();
1577 unsigned ASB = PtrB->getType()->getPointerAddressSpace();
1578
1579 // Check that the address spaces match.
1580 if (ASA != ASB)
1581 return std::nullopt;
1582 unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
1583
1584 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
1585 Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1586 Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1587
1588 int Val;
1589 if (PtrA1 == PtrB1) {
1590 // Retrieve the address space again as pointer stripping now tracks through
1591 // `addrspacecast`.
1592 ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
1593 ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
1594 // Check that the address spaces match and that the pointers are valid.
1595 if (ASA != ASB)
1596 return std::nullopt;
1597
1598 IdxWidth = DL.getIndexSizeInBits(ASA);
1599 OffsetA = OffsetA.sextOrTrunc(IdxWidth);
1600 OffsetB = OffsetB.sextOrTrunc(IdxWidth);
1601
1602 OffsetB -= OffsetA;
1603 Val = OffsetB.getSExtValue();
1604 } else {
1605 // Otherwise compute the distance with SCEV between the base pointers.
1606 const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
1607 const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
1608 const auto *Diff =
1609 dyn_cast<SCEVConstant>(SE.getMinusSCEV(PtrSCEVB, PtrSCEVA));
1610 if (!Diff)
1611 return std::nullopt;
1612 Val = Diff->getAPInt().getSExtValue();
1613 }
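 // A worked example of the final division below (the concrete values are
 // hypothetical): for PtrA == &A[0] and PtrB == &A[3] with i32 elements, the
 // byte offset Val is 12 and Size is 4, giving Dist = 3. With StrictCheck, a
 // byte offset of 10 would be rejected, since 2 * 4 != 10.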
1614 int Size = DL.getTypeStoreSize(ElemTyA);
1615 int Dist = Val / Size;
1616
1617 // Ensure that the calculated distance matches the type-based one after all
1618 // bitcasts have been removed from the provided pointers.
1619 if (!StrictCheck || Dist * Size == Val)
1620 return Dist;
1621 return std::nullopt;
1622}
1623
1625 const DataLayout &DL, ScalarEvolution &SE,
1626 SmallVectorImpl<unsigned> &SortedIndices) {
1628 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
1629 "Expected list of pointer operands.");
1630 // Walk over the pointers, and map each of them to an offset relative to
1631 // the first pointer in the array.
1632 Value *Ptr0 = VL[0];
1633
1634 using DistOrdPair = std::pair<int64_t, int>;
1635 auto Compare = llvm::less_first();
1636 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
1637 Offsets.emplace(0, 0);
1638 int Cnt = 1;
1639 bool IsConsecutive = true;
1640 for (auto *Ptr : VL.drop_front()) {
1641 std::optional<int> Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
1642 /*StrictCheck=*/true);
1643 if (!Diff)
1644 return false;
1645
1646 // Check whether a pointer with the same offset has already been seen; if so, bail out.
1647 int64_t Offset = *Diff;
1648 auto Res = Offsets.emplace(Offset, Cnt);
1649 if (!Res.second)
1650 return false;
1651 // Consecutive order if the inserted element is the last one.
1652 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
1653 ++Cnt;
1654 }
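 // A hypothetical example of the bookkeeping above: for VL = {&A[0], &A[2], &A[1]}
 // with i32 elements, the element offsets relative to &A[0] are 0, 2 and 1, so
 // Offsets holds {(0,0), (1,2), (2,1)}. The insertion of offset 1 does not land
 // at the end of the set, so IsConsecutive becomes false and SortedIndices is
 // filled below as {0, 2, 1}.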
1655 SortedIndices.clear();
1656 if (!IsConsecutive) {
1657 // Fill the SortedIndices array only if the accesses are not already consecutive.
1658 SortedIndices.resize(VL.size());
1659 Cnt = 0;
1660 for (const std::pair<int64_t, int> &Pair : Offsets) {
1661 SortedIndices[Cnt] = Pair.second;
1662 ++Cnt;
1663 }
1664 }
1665 return true;
1666}
1667
1668/// Returns true if the memory operations \p A and \p B are consecutive.
1670 ScalarEvolution &SE, bool CheckType) {
1673 if (!PtrA || !PtrB)
1674 return false;
1675 Type *ElemTyA = getLoadStoreType(A);
1676 Type *ElemTyB = getLoadStoreType(B);
1677 std::optional<int> Diff =
1678 getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
1679 /*StrictCheck=*/true, CheckType);
1680 return Diff && *Diff == 1;
1681}
1682
1684 visitPointers(SI->getPointerOperand(), *InnermostLoop,
1685 [this, SI](Value *Ptr) {
1686 Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
1687 InstMap.push_back(SI);
1688 ++AccessIdx;
1689 });
1690}
1691
1693 visitPointers(LI->getPointerOperand(), *InnermostLoop,
1694 [this, LI](Value *Ptr) {
1695 Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
1696 InstMap.push_back(LI);
1697 ++AccessIdx;
1698 });
1699}
1700
1703 switch (Type) {
1704 case NoDep:
1705 case Forward:
1708
1709 case Unknown:
1712 case Backward:
1714 case IndirectUnsafe:
1716 }
1717 llvm_unreachable("unexpected DepType!");
1718}
1719
1721 switch (Type) {
1722 case NoDep:
1723 case Forward:
1724 case ForwardButPreventsForwarding:
1725 case Unknown:
1726 case IndirectUnsafe:
1727 return false;
1728
1729 case BackwardVectorizable:
1730 case Backward:
1731 case BackwardVectorizableButPreventsForwarding:
1732 return true;
1733 }
1734 llvm_unreachable("unexpected DepType!");
1735}
1736
1738 return isBackward() || Type == Unknown;
1739}
1740
1742 switch (Type) {
1743 case Forward:
1744 case ForwardButPreventsForwarding:
1745 return true;
1746
1747 case NoDep:
1748 case Unknown:
1749 case BackwardVectorizable:
1750 case Backward:
1751 case BackwardVectorizableButPreventsForwarding:
1752 case IndirectUnsafe:
1753 return false;
1754 }
1755 llvm_unreachable("unexpected DepType!");
1756}
1757
1758bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
1759 uint64_t TypeByteSize) {
1760 // If loads occur at a distance that is not a multiple of a feasible vector
1761 // factor, store-load forwarding does not take place.
1762 // Positive dependences might cause trouble because vectorizing them might
1763 // prevent store-load forwarding, making vectorized code run a lot slower.
1764 //   a[i] = a[i-3] ^ a[i-8];
1765 // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2], and
1766 // hence on a typical architecture store-load forwarding does not take
1767 // place. Vectorizing in such cases does not make sense.
1768 // Compute the store-load forwarding distance below.
1769
1770 // After this many iterations store-to-load forwarding conflicts should not
1771 // cause any slowdowns.
1772 const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
1773 // Maximum vector factor.
1774 uint64_t MaxVFWithoutSLForwardIssues = std::min(
1775 VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
1776
1777 // Compute the smallest VF at which the store and load would be misaligned.
1778 for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
1779 VF *= 2) {
1780 // If the number of vector iterations between the store and the load is
1781 // small, we could incur conflicts.
1782 if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
1783 MaxVFWithoutSLForwardIssues = (VF >> 1);
1784 break;
1785 }
1786 }
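 // For illustration (hypothetical numbers): with TypeByteSize = 4 and
 // Distance = 12, the loop above stops at VF = 8, since 12 % 8 != 0 and
 // 12 / 8 = 1 is below the 8 * TypeByteSize = 32 iteration threshold, so
 // MaxVFWithoutSLForwardIssues drops to 4. That is below 2 * TypeByteSize,
 // and the conflict is reported below.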
1787
1788 if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
1789 LLVM_DEBUG(
1790 dbgs() << "LAA: Distance " << Distance
1791 << " that could cause a store-load forwarding conflict\n");
1792 return true;
1793 }
1794
1795 if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
1796 MaxVFWithoutSLForwardIssues !=
1797 VectorizerParams::MaxVectorWidth * TypeByteSize)
1798 MinDepDistBytes = MaxVFWithoutSLForwardIssues;
1799 return false;
1800}
1801
1802void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1803 if (Status < S)
1804 Status = S;
1805}
1806
1807/// Given a dependence-distance \p Dist between two
1808 /// memory accesses that have the same stride, whose absolute value is given
1809 /// in \p Stride, and that have the same type size \p TypeByteSize,
1810 /// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1811/// possible to prove statically that the dependence distance is larger
1812/// than the range that the accesses will travel through the execution of
1813/// the loop. If so, return true; false otherwise. This is useful for
1814/// example in loops such as the following (PR31098):
1815/// for (i = 0; i < D; ++i) {
1816/// = out[i];
1817/// out[i+D] =
1818/// }
1820 const SCEV &BackedgeTakenCount,
1821 const SCEV &Dist, uint64_t Stride,
1822 uint64_t TypeByteSize) {
1823
1824 // If we can prove that
1825 // (**) |Dist| > BackedgeTakenCount * Step
1826 // where Step is the absolute stride of the memory accesses in bytes,
1827 // then there is no dependence.
1828 //
1829 // Rationale:
1830 // We basically want to check if the absolute distance (|Dist/Step|)
1831 // is >= the loop iteration count (or > BackedgeTakenCount).
1832 // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
1833 // Section 4.2.1). Note that for vectorization it is sufficient to prove
1834 // that the dependence distance is >= VF; this is checked elsewhere.
1835 // But in some cases we can prune dependence distances early, and
1836 // even before selecting the VF, and without a runtime test, by comparing
1837 // the distance against the loop iteration count. Since the vectorized code
1838 // will be executed only if LoopCount >= VF, proving distance >= LoopCount
1839 // also guarantees that distance >= VF.
1840 //
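 // As a concrete (hypothetical) instance of (**): for the loop in the comment
 // above with 4-byte elements, Dist is 4 * D bytes, Step is 4 and
 // BackedgeTakenCount is D - 1, so Dist - BackedgeTakenCount * Step
 // = 4 * D - 4 * (D - 1) = 4 > 0, which proves the accesses independent.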
1841 const uint64_t ByteStride = Stride * TypeByteSize;
1842 const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
1843 const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
1844
1845 const SCEV *CastedDist = &Dist;
1846 const SCEV *CastedProduct = Product;
1847 uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
1848 uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());
1849
1850 // The dependence distance can be positive/negative, so we sign extend Dist;
1851 // The multiplication of the absolute stride in bytes and the
1852 // backedgeTakenCount is non-negative, so we zero extend Product.
1853 if (DistTypeSizeBits > ProductTypeSizeBits)
1854 CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
1855 else
1856 CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
1857
1858 // Is Dist - (BackedgeTakenCount * Step) > 0 ?
1859 // (If so, then we have proven (**) because |Dist| >= Dist)
1860 const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
1861 if (SE.isKnownPositive(Minus))
1862 return true;
1863
1864 // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
1865 // (If so, then we have proven (**) because |Dist| >= -1*Dist)
1866 const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
1867 Minus = SE.getMinusSCEV(NegDist, CastedProduct);
1868 if (SE.isKnownPositive(Minus))
1869 return true;
1870
1871 return false;
1872}
1873
1874/// Check the dependence for two accesses with the same stride \p Stride.
1875/// \p Distance is the positive distance and \p TypeByteSize is type size in
1876/// bytes.
1877///
1878/// \returns true if they are independent.
1880 uint64_t TypeByteSize) {
1881 assert(Stride > 1 && "The stride must be greater than 1");
1882 assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
1883 assert(Distance > 0 && "The distance must be non-zero");
1884
1885 // Skip if the distance is not a multiple of the type byte size.
1886 if (Distance % TypeByteSize)
1887 return false;
1888
1889 uint64_t ScaledDist = Distance / TypeByteSize;
1890
1891 // No dependence if the scaled distance is not a multiple of the stride.
1892 // E.g.
1893 // for (i = 0; i < 1024 ; i += 4)
1894 // A[i+2] = A[i] + 1;
1895 //
1896 // Two accesses in memory (scaled distance is 2, stride is 4):
1897 // | A[0] | | | | A[4] | | | |
1898 // | | | A[2] | | | | A[6] | |
1899 //
1900 // E.g.
1901 // for (i = 0; i < 1024 ; i += 3)
1902 // A[i+4] = A[i] + 1;
1903 //
1904 // Two accesses in memory (scaled distance is 4, stride is 3):
1905 // | A[0] | | | A[3] | | | A[6] | | |
1906 // | | | | | A[4] | | | A[7] | |
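 // By contrast (hypothetical), "A[i+4] = A[i]" with i += 4 gives a scaled
 // distance of 4 and a stride of 4, so the remainder below is zero and the
 // accesses are not proven independent (iteration k writes the element that
 // iteration k+1 reads).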
1907 return ScaledDist % Stride;
1908}
1909
1910/// Returns true if any of the underlying objects has a loop varying address,
1911/// i.e. may change in \p L.
1912static bool
1914 ScalarEvolution &SE, const Loop *L) {
1915 return any_of(UnderlyingObjects, [&SE, L](const Value *UO) {
1916 return !SE.isLoopInvariant(SE.getSCEV(const_cast<Value *>(UO)), L);
1917 });
1918}
1919
1920 // Get the dependence distance, stride, type size and whether each access is a
1921 // write for the dependence between A and B. Returns a DepType if we can prove
1922 // there's no dependence or the analysis fails. Outlined into a helper to limit
1923 // the scope of various temporary variables, like A/BPtr, StrideA/BPtr and others.
1924 // Returns either the dependence result, if it could already be determined, or a
1925 // tuple with (Distance, Stride, TypeSize, AIsWrite, BIsWrite).
1926static std::variant<MemoryDepChecker::Dependence::DepType,
1927 std::tuple<const SCEV *, uint64_t, uint64_t, bool, bool>>
1931 const DenseMap<Value *, const SCEV *> &Strides,
1932 const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1933 PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1934 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
1935 auto &SE = *PSE.getSE();
1936 auto [APtr, AIsWrite] = A;
1937 auto [BPtr, BIsWrite] = B;
1938
1939 // Two reads are independent.
1940 if (!AIsWrite && !BIsWrite)
1942
1943 Type *ATy = getLoadStoreType(AInst);
1944 Type *BTy = getLoadStoreType(BInst);
1945
1946 // We cannot check pointers in different address spaces.
1947 if (APtr->getType()->getPointerAddressSpace() !=
1948 BPtr->getType()->getPointerAddressSpace())
1950
1951 int64_t StrideAPtr =
1952 getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
1953 int64_t StrideBPtr =
1954 getPtrStride(PSE, BTy, BPtr, InnermostLoop, Strides, true).value_or(0);
1955
1956 const SCEV *Src = PSE.getSCEV(APtr);
1957 const SCEV *Sink = PSE.getSCEV(BPtr);
1958
1959 // If the induction step is negative we have to invert source and sink of the
1960 // dependence when measuring the distance between them. We should not swap
1961 // AIsWrite with BIsWrite, as their uses expect them in program order.
1962 if (StrideAPtr < 0) {
1963 std::swap(Src, Sink);
1964 std::swap(AInst, BInst);
1965 }
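 // For example (hypothetical): in "for (i = N; i > 0; --i) A[i] = A[i+1];"
 // with i32 elements, both pointers have StrideAPtr == -1 and Sink - Src
 // would be -4 even though the store of one iteration feeds the load of the
 // next; after the swap the distance is +4 and is classified as a backward
 // dependence by isDependent.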
1966
1967 const SCEV *Dist = SE.getMinusSCEV(Sink, Src);
1968
1969 LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
1970 << "(Induction step: " << StrideAPtr << ")\n");
1971 LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
1972 << ": " << *Dist << "\n");
1973
1974 // Needs accesses where the addresses of the accessed underlying objects do
1975 // not change within the loop.
1976 if (isLoopVariantIndirectAddress(UnderlyingObjects.find(APtr)->second, SE,
1977 InnermostLoop) ||
1978 isLoopVariantIndirectAddress(UnderlyingObjects.find(BPtr)->second, SE,
1979 InnermostLoop))
1981
1982 // Need accesses with constant stride. We don't want to vectorize
1983 // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
1984 // in the address space.
1985 if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
1986 LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
1988 }
1989
1990 uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1991 bool HasSameSize =
1992 DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
1993 if (!HasSameSize)
1994 TypeByteSize = 0;
1995 uint64_t Stride = std::abs(StrideAPtr);
1996 return std::make_tuple(Dist, Stride, TypeByteSize, AIsWrite, BIsWrite);
1997}
1998
1999MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
2000 const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
2001 unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2003 &UnderlyingObjects) {
2004 assert(AIdx < BIdx && "Must pass arguments in program order");
2005
2006 // Get the dependence distance, stride, type size and which accesses are
2007 // writes for the dependence between A and B.
2009 A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2010 InnermostLoop);
2011 if (std::holds_alternative<Dependence::DepType>(Res))
2012 return std::get<Dependence::DepType>(Res);
2013
2014 const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] =
2015 std::get<std::tuple<const SCEV *, uint64_t, uint64_t, bool, bool>>(Res);
2016 bool HasSameSize = TypeByteSize > 0;
2017
2018 ScalarEvolution &SE = *PSE.getSE();
2019 auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
2020 if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize &&
2022 Stride, TypeByteSize))
2023 return Dependence::NoDep;
2024
2025 const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2026 if (!C) {
2027 LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
2028 FoundNonConstantDistanceDependence = true;
2029 return Dependence::Unknown;
2030 }
2031
2032 const APInt &Val = C->getAPInt();
2033 int64_t Distance = Val.getSExtValue();
2034
2035 // Attempt to prove strided accesses independent.
2036 if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize &&
2037 areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
2038 LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
2039 return Dependence::NoDep;
2040 }
2041
2042 // Negative distances are not plausible dependencies.
2043 if (Val.isNegative()) {
2044 bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2045 // There is no need to update MaxSafeVectorWidthInBits after the call to
2046 // couldPreventStoreLoadForward, even if it changed MinDepDistBytes,
2047 // since a forward dependency will allow vectorization using any width.
2048 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2049 (!HasSameSize || couldPreventStoreLoadForward(Val.abs().getZExtValue(),
2050 TypeByteSize))) {
2051 LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
2053 }
2054
2055 LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
2056 return Dependence::Forward;
2057 }
2058
2059 // Write to the same location with the same size.
2060 if (Val == 0) {
2061 if (HasSameSize)
2062 return Dependence::Forward;
2063 LLVM_DEBUG(
2064 dbgs() << "LAA: Zero dependence difference but different type sizes\n");
2065 return Dependence::Unknown;
2066 }
2067
2068 assert(Val.isStrictlyPositive() && "Expect a positive value");
2069
2070 if (!HasSameSize) {
2071 LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2072 "different type sizes\n");
2073 return Dependence::Unknown;
2074 }
2075
2076 // Bail out early if passed-in parameters make vectorization not feasible.
2077 unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
2079 unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
2081 // The minimum number of iterations for a vectorized/unrolled version.
2082 unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
2083
2084 // It's not vectorizable if the distance is smaller than the minimum distance
2085 // needed for a vectorized/unrolled version. Vectorizing one iteration in
2086 // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2087 // TypeByteSize (no need to add the last gap distance).
2088 //
2089 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2090 // foo(int *A) {
2091 // int *B = (int *)((char *)A + 14);
2092 // for (i = 0 ; i < 1024 ; i += 2)
2093 // B[i] = A[i] + 1;
2094 // }
2095 //
2096 // Two accesses in memory (stride is 2):
2097 // | A[0] | | A[2] | | A[4] | | A[6] | |
2098 // | B[0] | | B[2] | | B[4] |
2099 //
2100 // Distance needs for vectorizing iterations except the last iteration:
2101 // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4.
2102 // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
2103 //
2104 // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
2105 // 12, which is less than distance.
2106 //
2107 // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
2108 // the minimum distance needed is 28, which is greater than distance. It is
2109 // not safe to do vectorization.
2110 uint64_t MinDistanceNeeded =
2111 TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
2112 if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
2113 LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
2114 << Distance << '\n');
2115 return Dependence::Backward;
2116 }
2117
2118 // Unsafe if the minimum distance needed is greater than the smallest
2119 // dependence distance.
2120 if (MinDistanceNeeded > MinDepDistBytes) {
2121 LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
2122 << MinDistanceNeeded << " size in bytes\n");
2123 return Dependence::Backward;
2124 }
2125
2126 // Positive distance bigger than max vectorization factor.
2127 // FIXME: Should use max factor instead of max distance in bytes, which could
2128 // not handle different types.
2129 // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2130 // void foo (int *A, char *B) {
2131 // for (unsigned i = 0; i < 1024; i++) {
2132 // A[i+2] = A[i] + 1;
2133 // B[i+2] = B[i] + 1;
2134 // }
2135 // }
2136 //
2137 // This case is currently unsafe according to the max safe distance. If we
2138 // analyze the two accesses on array B, the max safe dependence distance
2139 // is 2. Then we analyze the accesses on array A, where the minimum distance
2140 // needed is 8, which is greater than 2 and thus forbids vectorization. But
2141 // actually both A and B could be vectorized with a vectorization factor of 2.
2142 MinDepDistBytes =
2143 std::min(static_cast<uint64_t>(Distance), MinDepDistBytes);
2144
2145 bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
2146 uint64_t MinDepDistBytesOld = MinDepDistBytes;
2147 if (IsTrueDataDependence && EnableForwardingConflictDetection &&
2148 couldPreventStoreLoadForward(Distance, TypeByteSize)) {
2149 // Sanity check that we didn't update MinDepDistBytes when calling
2150 // couldPreventStoreLoadForward
2151 assert(MinDepDistBytes == MinDepDistBytesOld &&
2152 "An update to MinDepDistBytes requires an update to "
2153 "MaxSafeVectorWidthInBits");
2154 (void)MinDepDistBytesOld;
2156 }
2157
2158 // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2159 // since there is a backwards dependency.
2160 uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride);
2161 LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
2162 << " with max VF = " << MaxVF << '\n');
2163 uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2164 MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
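 // A hypothetical instance of the computation above: with MinDepDistBytes = 16,
 // TypeByteSize = 4 and Stride = 1, MaxVF is 4 elements and
 // MaxSafeVectorWidthInBits is capped at 128 bits.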
2166}
2167
2169 DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
2170 const DenseMap<Value *, const SCEV *> &Strides,
2172 &UnderlyingObjects) {
2173
2174 MinDepDistBytes = -1;
2176 for (MemAccessInfo CurAccess : CheckDeps) {
2177 if (Visited.count(CurAccess))
2178 continue;
2179
2180 // Get the relevant memory access set.
2182 AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
2183
2184 // Check accesses within this set.
2186 AccessSets.member_begin(I);
2188 AccessSets.member_end();
2189
2190 // Check every access pair.
2191 while (AI != AE) {
2192 Visited.insert(*AI);
2193 bool AIIsWrite = AI->getInt();
2194 // Check loads only against subsequent members of the equivalence class, but
2195 // stores also against other stores in the same class - to the same address.
2197 (AIIsWrite ? AI : std::next(AI));
2198 while (OI != AE) {
2199 // Check every accessing instruction pair in program order.
2200 for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
2201 I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
2202 // Scan all accesses of the other member of the equivalence class, but only
2203 // the subsequent accesses of the same member.
2204 for (std::vector<unsigned>::iterator
2205 I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
2206 I2E = (OI == AI ? I1E : Accesses[*OI].end());
2207 I2 != I2E; ++I2) {
2208 auto A = std::make_pair(&*AI, *I1);
2209 auto B = std::make_pair(&*OI, *I2);
2210
2211 assert(*I1 != *I2);
2212 if (*I1 > *I2)
2213 std::swap(A, B);
2214
2216 isDependent(*A.first, A.second, *B.first, B.second, Strides,
2217 UnderlyingObjects);
2219
2220 // Gather dependences unless we accumulated MaxDependences
2221 // dependences. In that case return as soon as we find the first
2222 // unsafe dependence. This puts a limit on this quadratic
2223 // algorithm.
2224 if (RecordDependences) {
2225 if (Type != Dependence::NoDep)
2226 Dependences.push_back(Dependence(A.second, B.second, Type));
2227
2228 if (Dependences.size() >= MaxDependences) {
2229 RecordDependences = false;
2230 Dependences.clear();
2232 << "Too many dependences, stopped recording\n");
2233 }
2234 }
2235 if (!RecordDependences && !isSafeForVectorization())
2236 return false;
2237 }
2238 ++OI;
2239 }
2240 AI++;
2241 }
2242 }
2243
2244 LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
2245 return isSafeForVectorization();
2246}
2247
2250 MemAccessInfo Access(Ptr, isWrite);
2251 auto &IndexVector = Accesses.find(Access)->second;
2252
2254 transform(IndexVector,
2255 std::back_inserter(Insts),
2256 [&](unsigned Idx) { return this->InstMap[Idx]; });
2257 return Insts;
2258}
2259
2261 "NoDep",
2262 "Unknown",
2263 "IndirectUnsafe",
2264 "Forward",
2265 "ForwardButPreventsForwarding",
2266 "Backward",
2267 "BackwardVectorizable",
2268 "BackwardVectorizableButPreventsForwarding"};
2269
2271 raw_ostream &OS, unsigned Depth,
2272 const SmallVectorImpl<Instruction *> &Instrs) const {
2273 OS.indent(Depth) << DepName[Type] << ":\n";
2274 OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
2275 OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
2276}
2277
2278bool LoopAccessInfo::canAnalyzeLoop() {
2279 // We need to have a loop header.
2280 LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
2281 << TheLoop->getHeader()->getParent()->getName() << ": "
2282 << TheLoop->getHeader()->getName() << '\n');
2283
2284 // We can only analyze innermost loops.
2285 if (!TheLoop->isInnermost()) {
2286 LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
2287 recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
2288 return false;
2289 }
2290
2291 // We must have a single backedge.
2292 if (TheLoop->getNumBackEdges() != 1) {
2293 LLVM_DEBUG(
2294 dbgs() << "LAA: loop control flow is not understood by analyzer\n");
2295 recordAnalysis("CFGNotUnderstood")
2296 << "loop control flow is not understood by analyzer";
2297 return false;
2298 }
2299
2300 // ScalarEvolution needs to be able to find the exit count.
2301 const SCEV *ExitCount = PSE->getBackedgeTakenCount();
2302 if (isa<SCEVCouldNotCompute>(ExitCount)) {
2303 recordAnalysis("CantComputeNumberOfIterations")
2304 << "could not determine number of loop iterations";
2305 LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
2306 return false;
2307 }
2308
2309 return true;
2310}
2311
2312void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
2313 const TargetLibraryInfo *TLI,
2314 DominatorTree *DT) {
2315 // Holds the Load and Store instructions.
2318 SmallPtrSet<MDNode *, 8> LoopAliasScopes;
2319
2320 // Holds all the different accesses in the loop.
2321 unsigned NumReads = 0;
2322 unsigned NumReadWrites = 0;
2323
2324 bool HasComplexMemInst = false;
2325
2326 // A runtime check is only legal to insert if there are no convergent calls.
2327 HasConvergentOp = false;
2328
2329 PtrRtChecking->Pointers.clear();
2330 PtrRtChecking->Need = false;
2331
2332 const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
2333
2334 const bool EnableMemAccessVersioningOfLoop =
2336 !TheLoop->getHeader()->getParent()->hasOptSize();
2337
2338 // Traverse blocks in fixed RPOT order, regardless of their storage in the
2339 // loop info, as it may be arbitrary.
2340 LoopBlocksRPO RPOT(TheLoop);
2341 RPOT.perform(LI);
2342 for (BasicBlock *BB : RPOT) {
2343 // Scan the BB and collect legal loads and stores. Also detect any
2344 // convergent instructions.
2345 for (Instruction &I : *BB) {
2346 if (auto *Call = dyn_cast<CallBase>(&I)) {
2347 if (Call->isConvergent())
2348 HasConvergentOp = true;
2349 }
2350
2351 // If both a non-vectorizable memory instruction and a convergent
2352 // operation are found in this loop, there is no reason to continue the search.
2353 if (HasComplexMemInst && HasConvergentOp) {
2354 CanVecMem = false;
2355 return;
2356 }
2357
2358 // Avoid hitting recordAnalysis multiple times.
2359 if (HasComplexMemInst)
2360 continue;
2361
2362 // Record alias scopes defined inside the loop.
2363 if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
2364 for (Metadata *Op : Decl->getScopeList()->operands())
2365 LoopAliasScopes.insert(cast<MDNode>(Op));
2366
2367 // Many math library functions read the rounding mode. We will only
2368 // vectorize a loop if it contains known function calls that don't set
2369 // the flag. Therefore, it is safe to ignore this read from memory.
2370 auto *Call = dyn_cast<CallInst>(&I);
2371 if (Call && getVectorIntrinsicIDForCall(Call, TLI))
2372 continue;
2373
2374 // If this is a load, save it. If this instruction can read from memory
2375 // but is not a load, then we quit. Notice that we don't handle function
2376 // calls that read or write.
2377 if (I.mayReadFromMemory()) {
2378 // If the function has an explicit vectorized counterpart, we can safely
2379 // assume that it can be vectorized.
2380 if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
2381 !VFDatabase::getMappings(*Call).empty())
2382 continue;
2383
2384 auto *Ld = dyn_cast<LoadInst>(&I);
2385 if (!Ld) {
2386 recordAnalysis("CantVectorizeInstruction", Ld)
2387 << "instruction cannot be vectorized";
2388 HasComplexMemInst = true;
2389 continue;
2390 }
2391 if (!Ld->isSimple() && !IsAnnotatedParallel) {
2392 recordAnalysis("NonSimpleLoad", Ld)
2393 << "read with atomic ordering or volatile read";
2394 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
2395 HasComplexMemInst = true;
2396 continue;
2397 }
2398 NumLoads++;
2399 Loads.push_back(Ld);
2400 DepChecker->addAccess(Ld);
2401 if (EnableMemAccessVersioningOfLoop)
2402 collectStridedAccess(Ld);
2403 continue;
2404 }
2405
2406 // Save 'store' instructions. Abort if other instructions write to memory.
2407 if (I.mayWriteToMemory()) {
2408 auto *St = dyn_cast<StoreInst>(&I);
2409 if (!St) {
2410 recordAnalysis("CantVectorizeInstruction", St)
2411 << "instruction cannot be vectorized";
2412 HasComplexMemInst = true;
2413 continue;
2414 }
2415 if (!St->isSimple() && !IsAnnotatedParallel) {
2416 recordAnalysis("NonSimpleStore", St)
2417 << "write with atomic ordering or volatile write";
2418 LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
2419 HasComplexMemInst = true;
2420 continue;
2421 }
2422 NumStores++;
2423 Stores.push_back(St);
2424 DepChecker->addAccess(St);
2425 if (EnableMemAccessVersioningOfLoop)
2426 collectStridedAccess(St);
2427 }
2428 } // Next instr.
2429 } // Next block.
2430
2431 if (HasComplexMemInst) {
2432 CanVecMem = false;
2433 return;
2434 }
2435
2436 // Now we have two lists that hold the loads and the stores.
2437 // Next, we find the pointers that they use.
2438
2439 // Check if we see any stores. If there are no stores, then we don't
2440 // care if the pointers are *restrict*.
2441 if (!Stores.size()) {
2442 LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
2443 CanVecMem = true;
2444 return;
2445 }
2446
2447 MemoryDepChecker::DepCandidates DependentAccesses;
2448 AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
2449 LoopAliasScopes);
2450
2451 // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
2452 // multiple times on the same object. If the ptr is accessed twice, once
2453 // for read and once for write, it will only appear once (on the write
2454 // list). This is okay, since we are going to check for conflicts between
2455 // writes and between reads and writes, but not between reads and reads.
2457
2458 // Record uniform store addresses to identify if we have multiple stores
2459 // to the same address.
2460 SmallPtrSet<Value *, 16> UniformStores;
2461
2462 for (StoreInst *ST : Stores) {
2463 Value *Ptr = ST->getPointerOperand();
2464
2465 if (isInvariant(Ptr)) {
2466 // Record store instructions to loop invariant addresses
2467 StoresToInvariantAddresses.push_back(ST);
2468 HasDependenceInvolvingLoopInvariantAddress |=
2469 !UniformStores.insert(Ptr).second;
2470 }
2471
2472 // If we did *not* see this pointer before, insert it to the read-write
2473 // list. At this phase it is only a 'write' list.
2474 Type *AccessTy = getLoadStoreType(ST);
2475 if (Seen.insert({Ptr, AccessTy}).second) {
2476 ++NumReadWrites;
2477
2479 // The TBAA metadata could have a control dependency on the predication
2480 // condition, so we cannot rely on it when determining whether or not we
2481 // need runtime pointer checks.
2482 if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
2483 Loc.AATags.TBAA = nullptr;
2484
2485 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2486 [&Accesses, AccessTy, Loc](Value *Ptr) {
2487 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2488 Accesses.addStore(NewLoc, AccessTy);
2489 });
2490 }
2491 }
2492
2493 if (IsAnnotatedParallel) {
2494 LLVM_DEBUG(
2495 dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
2496 << "checks.\n");
2497 CanVecMem = true;
2498 return;
2499 }
2500
2501 for (LoadInst *LD : Loads) {
2502 Value *Ptr = LD->getPointerOperand();
2503 // If we did *not* see this pointer before, insert it to the
2504 // read list. If we *did* see it before, then it is already in
2505 // the read-write list. This allows us to vectorize expressions
2506 // such as A[i] += x, because the address of A[i] is a read-write
2507 // pointer. This only works if the index of A[i] is consecutive.
2508 // If the address of i is unknown (for example A[B[i]]) then we may
2509 // read a few words, modify, and write a few words, and some of the
2510 // words may be written to the same address.
2511 bool IsReadOnlyPtr = false;
2512 Type *AccessTy = getLoadStoreType(LD);
2513 if (Seen.insert({Ptr, AccessTy}).second ||
2514 !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) {
2515 ++NumReads;
2516 IsReadOnlyPtr = true;
2517 }
2518
2519 // See if there is an unsafe dependency between a load from a uniform address
2520 // and a store to the same uniform address.
2521 if (UniformStores.count(Ptr)) {
2522 LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2523 "load and uniform store to the same address!\n");
2524 HasDependenceInvolvingLoopInvariantAddress = true;
2525 }
2526
2528 // The TBAA metadata could have a control dependency on the predication
2529 // condition, so we cannot rely on it when determining whether or not we
2530 // need runtime pointer checks.
2531 if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
2532 Loc.AATags.TBAA = nullptr;
2533
2534 visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
2535 [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
2536 MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
2537 Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
2538 });
2539 }
2540
2541 // If we write (or read-write) to a single destination and there are no
2542 // other reads in this loop then it is safe to vectorize.
2543 if (NumReadWrites == 1 && NumReads == 0) {
2544 LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
2545 CanVecMem = true;
2546 return;
2547 }
2548
2549 // Build dependence sets and check whether we need a runtime pointer bounds
2550 // check.
2551 Accesses.buildDependenceSets();
2552
2553 // Find pointers with computable bounds. We are going to use this information
2554 // to place a runtime bound check.
2555 Value *UncomputablePtr = nullptr;
2556 bool CanDoRTIfNeeded =
2557 Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
2558 SymbolicStrides, UncomputablePtr, false);
2559 if (!CanDoRTIfNeeded) {
2560 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2561 recordAnalysis("CantIdentifyArrayBounds", I)
2562 << "cannot identify array bounds";
2563 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
2564 << "the array bounds.\n");
2565 CanVecMem = false;
2566 return;
2567 }
2568
2569 LLVM_DEBUG(
2570 dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
2571
2572 CanVecMem = true;
2573 if (Accesses.isDependencyCheckNeeded()) {
2574 LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
2575 CanVecMem = DepChecker->areDepsSafe(
2576 DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2577 Accesses.getUnderlyingObjects());
2578
2579 if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
2580 LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
2581
2582 // Clear the dependency checks. We assume they are not needed.
2583 Accesses.resetDepChecks(*DepChecker);
2584
2585 PtrRtChecking->reset();
2586 PtrRtChecking->Need = true;
2587
2588 auto *SE = PSE->getSE();
2589 UncomputablePtr = nullptr;
2590 CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
2591 *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
2592
2593 // Check that we found the bounds for the pointer.
2594 if (!CanDoRTIfNeeded) {
2595 auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
2596 recordAnalysis("CantCheckMemDepsAtRunTime", I)
2597 << "cannot check memory dependencies at runtime";
2598 LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
2599 CanVecMem = false;
2600 return;
2601 }
2602
2603 CanVecMem = true;
2604 }
2605 }
2606
2607 if (HasConvergentOp) {
2608 recordAnalysis("CantInsertRuntimeCheckWithConvergent")
2609 << "cannot add control dependency to convergent operation";
2610 LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
2611 "would be needed with a convergent operation\n");
2612 CanVecMem = false;
2613 return;
2614 }
2615
2616 if (CanVecMem)
2617 LLVM_DEBUG(
2618 dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
2619 << (PtrRtChecking->Need ? "" : " don't")
2620 << " need runtime memory checks.\n");
2621 else
2622 emitUnsafeDependenceRemark();
2623}
2624
2625void LoopAccessInfo::emitUnsafeDependenceRemark() {
2626 auto Deps = getDepChecker().getDependences();
2627 if (!Deps)
2628 return;
2629 auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
2632 });
2633 if (Found == Deps->end())
2634 return;
2635 MemoryDepChecker::Dependence Dep = *Found;
2636
2637 LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
2638
2639 // Emit remark for first unsafe dependence
2640 bool HasForcedDistribution = false;
2641 std::optional<const MDOperand *> Value =
2642 findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
2643 if (Value) {
2644 const MDOperand *Op = *Value;
2645 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
2646 HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
2647 }
2648
2649 const std::string Info =
2650 HasForcedDistribution
2651 ? "unsafe dependent memory operations in loop."
2652 : "unsafe dependent memory operations in loop. Use "
2653 "#pragma clang loop distribute(enable) to allow loop distribution "
2654 "to attempt to isolate the offending operations into a separate "
2655 "loop";
2657 recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;
2658
2659 switch (Dep.Type) {
2663 llvm_unreachable("Unexpected dependence");
2665 R << "\nBackward loop carried data dependence.";
2666 break;
2668 R << "\nForward loop carried data dependence that prevents "
2669 "store-to-load forwarding.";
2670 break;
2672 R << "\nBackward loop carried data dependence that prevents "
2673 "store-to-load forwarding.";
2674 break;
2676 R << "\nUnsafe indirect dependence.";
2677 break;
2679 R << "\nUnknown data dependence.";
2680 break;
2681 }
2682
2683 if (Instruction *I = Dep.getSource(*this)) {
2684 DebugLoc SourceLoc = I->getDebugLoc();
2685 if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
2686 SourceLoc = DD->getDebugLoc();
2687 if (SourceLoc)
2688 R << " Memory location is the same as accessed at "
2689 << ore::NV("Location", SourceLoc);
2690 }
2691}
2692
2694 DominatorTree *DT) {
2695 assert(TheLoop->contains(BB) && "Unknown block used");
2696
2697 // Blocks that do not dominate the latch need predication.
2698 BasicBlock* Latch = TheLoop->getLoopLatch();
2699 return !DT->dominates(BB, Latch);
2700}
2701
2702OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
2703 Instruction *I) {
2704 assert(!Report && "Multiple reports generated");
2705
2706 Value *CodeRegion = TheLoop->getHeader();
2707 DebugLoc DL = TheLoop->getStartLoc();
2708
2709 if (I) {
2710 CodeRegion = I->getParent();
2711 // If there is no debug location attached to the instruction, fall back to
2712 // using the loop's start location.
2713 if (I->getDebugLoc())
2714 DL = I->getDebugLoc();
2715 }
2716
2717 Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
2718 CodeRegion);
2719 return *Report;
2720}
2721
2723 auto *SE = PSE->getSE();
2724 // TODO: Is this really what we want? Even without FP SCEV, we may want some
2725 // trivially loop-invariant FP values to be considered invariant.
2726 if (!SE->isSCEVable(V->getType()))
2727 return false;
2728 const SCEV *S = SE->getSCEV(V);
2729 return SE->isLoopInvariant(S, TheLoop);
2730}
2731
2732/// Find the operand of the GEP that should be checked for consecutive
2733/// stores. This ignores trailing indices that have no effect on the final
2734/// pointer.
2735static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
2736 const DataLayout &DL = Gep->getModule()->getDataLayout();
2737 unsigned LastOperand = Gep->getNumOperands() - 1;
2738 TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
2739
2740 // Walk backwards and try to peel off zeros.
2741 while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
2742 // Find the type we're currently indexing into.
2743 gep_type_iterator GEPTI = gep_type_begin(Gep);
2744 std::advance(GEPTI, LastOperand - 2);
2745
2746 // If it's a type with the same allocation size as the result of the GEP we
2747 // can peel off the zero index.
2748 TypeSize ElemSize = GEPTI.isStruct()
2749 ? DL.getTypeAllocSize(GEPTI.getIndexedType())
2751 if (ElemSize != GEPAllocSize)
2752 break;
2753 --LastOperand;
2754 }
2755
2756 return LastOperand;
2757}
2758
2759/// If the argument is a GEP, then returns the operand identified by
2760/// getGEPInductionOperand. However, if there is some other non-loop-invariant
2761/// operand, it returns that instead.
2763 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
2764 if (!GEP)
2765 return Ptr;
2766
2767 unsigned InductionOperand = getGEPInductionOperand(GEP);
2768
2769 // Check that all of the gep indices are uniform except for our induction
2770 // operand.
2771 for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
2772 if (i != InductionOperand &&
2773 !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
2774 return Ptr;
2775 return GEP->getOperand(InductionOperand);
2776}
2777
2778/// If a value has only one user that is a CastInst, return it.
2780 Value *UniqueCast = nullptr;
2781 for (User *U : Ptr->users()) {
2782 CastInst *CI = dyn_cast<CastInst>(U);
2783 if (CI && CI->getType() == Ty) {
2784 if (!UniqueCast)
2785 UniqueCast = CI;
2786 else
2787 return nullptr;
2788 }
2789 }
2790 return UniqueCast;
2791}
2792
2793/// Get the stride of a pointer access in a loop. Looks for symbolic
2794/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
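/// For illustration (hypothetical IR): for a GEP implementing "A[i * %Stride]"
/// over i32, the index stripped out below has the SCEV {0,+,%Stride}, whose
/// step recurrence is the loop-invariant SCEVUnknown %Stride; that SCEV is
/// what gets returned and later versioned on via a "%Stride == 1" predicate.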
2796 auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
2797 if (!PtrTy || PtrTy->isAggregateType())
2798 return nullptr;
2799
2800 // Try to remove a gep instruction to make the pointer (actually an index at
2801 // this point) easier to analyze. If OrigPtr is equal to Ptr, we are analyzing
2802 // the pointer; otherwise, we are analyzing the index.
2803 Value *OrigPtr = Ptr;
2804
2805 // The size of the pointer access.
2806 int64_t PtrAccessSize = 1;
2807
2808 Ptr = stripGetElementPtr(Ptr, SE, Lp);
2809 const SCEV *V = SE->getSCEV(Ptr);
2810
2811 if (Ptr != OrigPtr)
2812 // Strip off casts.
2813 while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
2814 V = C->getOperand();
2815
2816 const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
2817 if (!S)
2818 return nullptr;
2819
2820 // If the pointer is invariant then there is no stride and it makes no
2821 // sense to add it here.
2822 if (Lp != S->getLoop())
2823 return nullptr;
2824
2825 V = S->getStepRecurrence(*SE);
2826 if (!V)
2827 return nullptr;
2828
2829 // Strip off the size of access multiplication if we are still analyzing the
2830 // pointer.
2831 if (OrigPtr == Ptr) {
2832 if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
2833 if (M->getOperand(0)->getSCEVType() != scConstant)
2834 return nullptr;
2835
2836 const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
2837
2838 // Huge step value - give up.
2839 if (APStepVal.getBitWidth() > 64)
2840 return nullptr;
2841
2842 int64_t StepVal = APStepVal.getSExtValue();
2843 if (PtrAccessSize != StepVal)
2844 return nullptr;
2845 V = M->getOperand(1);
2846 }
2847 }
2848
2849 // Note that the restrictions after this loop-invariance check are only
2850 // profitability restrictions.
2851 if (!SE->isLoopInvariant(V, Lp))
2852 return nullptr;
2853
2854 // Look for the loop invariant symbolic value.
2855 const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
2856 if (!U) {
2857 const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
2858 if (!C)
2859 return nullptr;
2860 U = dyn_cast<SCEVUnknown>(C->getOperand());
2861 if (!U)
2862 return nullptr;
2863
2864 // Match legacy behavior - this is not needed for correctness
2865 if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
2866 return nullptr;
2867 }
2868
2869 return V;
2870}
2871
2872void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
2873 Value *Ptr = getLoadStorePointerOperand(MemAccess);
2874 if (!Ptr)
2875 return;
2876
2877 // Note: getStrideFromPointer is a *profitability* heuristic. We
2878 // could broaden the scope of values returned here - to anything
2879 // which happens to be loop invariant and contributes to the
2880 // computation of an interesting IV - but we chose not to as we
2881 // don't have a cost model here, and broadening the scope exposes
2882 // far too many unprofitable cases.
2883 const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
2884 if (!StrideExpr)
2885 return;
2886
2887 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
2888 "versioning:");
2889 LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");
2890
2891 if (!SpeculateUnitStride) {
2892 LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
2893 return;
2894 }
2895
2896 // Avoid adding the "Stride == 1" predicate when we know that
2897 // Stride >= Trip-Count. Such a predicate will effectively optimize a single
2898 // or zero iteration loop, as Trip-Count <= Stride == 1.
2899 //
2900 // TODO: We are currently not making a very informed decision on when it is
2901 // beneficial to apply stride versioning. It might make more sense that the
2902 // users of this analysis (such as the vectorizer) will trigger it, based on
2903 // their specific cost considerations; For example, in cases where stride
2904 // versioning does not help resolving memory accesses/dependences, the
2905 // vectorizer should evaluate the cost of the runtime test, and the benefit
2906 // of various possible stride specializations, considering the alternatives
2907 // of using gather/scatters (if available).
2908
2909 const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
2910
2911 // Match the types so we can compare the stride and the BETakenCount.
2912 // The Stride can be positive/negative, so we sign extend Stride;
2913 // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
2914 const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
2915 uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
2916 uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
2917 const SCEV *CastedStride = StrideExpr;
2918 const SCEV *CastedBECount = BETakenCount;
2919 ScalarEvolution *SE = PSE->getSE();
2920 if (BETypeSizeBits >= StrideTypeSizeBits)
2921 CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
2922 else
2923 CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
2924 const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
2925 // Since TripCount == BackEdgeTakenCount + 1, checking:
2926 // "Stride >= TripCount" is equivalent to checking:
2927 // Stride - BETakenCount > 0
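 // For example (hypothetical values): if the backedge-taken count is known to
 // be 7 and the symbolic stride is known to be at least 8, the loop under the
 // "Stride == 1" predicate could execute at most once, so versioning is
 // skipped below.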
2928 if (SE->isKnownPositive(StrideMinusBETaken)) {
2929 LLVM_DEBUG(
2930 dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
2931 "Stride==1 predicate will imply that the loop executes "
2932 "at most once.\n");
2933 return;
2934 }
2935 LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
2936
2937 // Strip back off the integer cast, and check that our result is a
2938 // SCEVUnknown as we expect.
2939 const SCEV *StrideBase = StrideExpr;
2940 if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
2941 StrideBase = C->getOperand();
2942 SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
2943}
2944
2946 const TargetLibraryInfo *TLI, AAResults *AA,
2947 DominatorTree *DT, LoopInfo *LI)
2948 : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
2949 PtrRtChecking(nullptr),
2950 DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
2951 PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
2952 if (canAnalyzeLoop()) {
2953 analyzeLoop(AA, LI, TLI, DT);
2954 }
2955}
2956
2958 if (CanVecMem) {
2959 OS.indent(Depth) << "Memory dependences are safe";
2960 const MemoryDepChecker &DC = getDepChecker();
2961 if (!DC.isSafeForAnyVectorWidth())
2962 OS << " with a maximum safe vector width of "
2963 << DC.getMaxSafeVectorWidthInBits() << " bits";
2964 if (PtrRtChecking->Need)
2965 OS << " with run-time checks";
2966 OS << "\n";
2967 }
2968
2969 if (HasConvergentOp)
2970 OS.indent(Depth) << "Has convergent operation in loop\n";
2971
2972 if (Report)
2973 OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
2974
2975 if (auto *Dependences = DepChecker->getDependences()) {
2976 OS.indent(Depth) << "Dependences:\n";
2977 for (const auto &Dep : *Dependences) {
2978 Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
2979 OS << "\n";
2980 }
2981 } else
2982 OS.indent(Depth) << "Too many dependences, not recorded\n";
2983
2984 // List the pairs of accesses that need run-time checks to prove independence.
2985 PtrRtChecking->print(OS, Depth);
2986 OS << "\n";
2987
2988 OS.indent(Depth) << "Non vectorizable stores to invariant address were "
2989 << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
2990 << "found in loop.\n";
2991
2992 OS.indent(Depth) << "SCEV assumptions:\n";
2993 PSE->getPredicate().print(OS, Depth);
2994
2995 OS << "\n";
2996
2997 OS.indent(Depth) << "Expressions re-written:\n";
2998 PSE->print(OS, Depth);
2999}
3000
3002 auto I = LoopAccessInfoMap.insert({&L, nullptr});
3003
3004 if (I.second)
3005 I.first->second =
3006 std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);
3007
3008 return *I.first->second;
3009}
3010
3012 Function &F, const PreservedAnalyses &PA,
3014 // Check whether our analysis is preserved.
3015 auto PAC = PA.getChecker<LoopAccessAnalysis>();
3016 if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
3017 // If not, give up now.
3018 return true;
3019
3020 // Check whether the analyses we depend on became invalid for any reason.
3021 // Skip checking TargetLibraryAnalysis as it is immutable and can't become
3022 // invalid.
3023 return Inv.invalidate<AAManager>(F, PA) ||
3025 Inv.invalidate<LoopAnalysis>(F, PA) ||
3027}
3028
3032 auto &AA = FAM.getResult<AAManager>(F);
3033 auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
3034 auto &LI = FAM.getResult<LoopAnalysis>(F);
3035 auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
3036 return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
3037}
3038
3039AnalysisKey LoopAccessAnalysis::Key;
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
#define Check(C,...)
#define DEBUG_TYPE
Hexagon Common GEP
IRTranslator LLVM IR MI
static cl::opt< unsigned > MaxDependences("max-dependences", cl::Hidden, cl::desc("Maximum number of dependences collected by " "loop-access analysis (default = 100)"), cl::init(100))
We collect dependences up to this threshold.
static cl::opt< bool > EnableForwardingConflictDetection("store-to-load-forwarding-conflict-detection", cl::Hidden, cl::desc("Enable conflict detection in loop-access analysis"), cl::init(true))
Enable store-to-load forwarding conflict detection.
static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr, SmallVectorImpl< PointerIntPair< const SCEV *, 1, bool > > &ScevList, unsigned Depth)
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr, const SCEV *PtrScev, Loop *L, bool Assume)
Check whether a pointer can participate in a runtime bounds check.
static cl::opt< unsigned > MemoryCheckMergeThreshold("memory-check-merge-threshold", cl::Hidden, cl::desc("Maximum number of comparisons done when trying to merge " "runtime memory checks. (default = 100)"), cl::init(100))
The maximum iterations used to merge memory checks.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &Strides, Value *Ptr, Type *AccessTy, Loop *L)
Check whether a pointer address cannot wrap.
static const SCEV * getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
Get the stride of a pointer access in a loop.
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep)
Find the operand of the GEP that should be checked for consecutive stores.
static cl::opt< unsigned, true > VectorizationInterleave("force-vector-interleave", cl::Hidden, cl::desc("Sets the vectorization interleave count. " "Zero is autoselect."), cl::location(VectorizerParams::VectorizationInterleave))
static bool isLoopVariantIndirectAddress(ArrayRef< const Value * > UnderlyingObjects, ScalarEvolution &SE, const Loop *L)
Returns true if any of the underlying objects has a loop varying address, i.e.
static Value * getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty)
If a value has only one user that is a CastInst, return it.
static cl::opt< bool, true > HoistRuntimeChecks("hoist-runtime-checks", cl::Hidden, cl::desc("Hoist inner loop runtime memory checks to outer loop if possible"), cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true))
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
static cl::opt< unsigned, true > RuntimeMemoryCheckThreshold("runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " "generate more than this number of comparisons (default = 8)."), cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8))
static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, function_ref< void(Value *)> AddPointer)
static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, PredicatedScalarEvolution &PSE, const Loop *L)
Return true if an AddRec pointer Ptr is unsigned non-wrapping, i.e.
static Value * stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
If the argument is a GEP, then returns the operand identified by getGEPInductionOperand.
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV &BackedgeTakenCount, const SCEV &Dist, uint64_t Stride, uint64_t TypeByteSize)
Given a dependence-distance Dist between two memory accesses, that have the same stride whose absolut...
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, uint64_t TypeByteSize)
Check the dependence for two accesses with the same stride Stride.
static const SCEV * getMinFromExprs(const SCEV *I, const SCEV *J, ScalarEvolution *SE)
Compare I and J and return the minimum.
static std::variant< MemoryDepChecker::Dependence::DepType, std::tuple< const SCEV *, uint64_t, uint64_t, bool, bool > > getDependenceDistanceStrideAndSize(const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, const AccessAnalysis::MemAccessInfo &B, Instruction *BInst, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects, PredicatedScalarEvolution &PSE, const Loop *InnermostLoop)
static cl::opt< unsigned > MaxForkedSCEVDepth("max-forked-scev-depth", cl::Hidden, cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"), cl::init(5))
static cl::opt< bool > SpeculateUnitStride("laa-speculate-unit-stride", cl::Hidden, cl::desc("Speculate that non-constant strides are unit in LAA"), cl::init(true))
static SmallVector< PointerIntPair< const SCEV *, 1, bool > > findForkedPointer(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, Value *Ptr, const Loop *L)
static cl::opt< bool > EnableMemAccessVersioning("enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symbolic stride memory access versioning"))
This enables versioning on the strides of symbolically striding memory accesses in code like the foll...
This header provides classes for managing per-loop analyses.
This file provides utility analysis objects describing memory locations.
This header defines various interfaces for pass management in LLVM.
This file defines the PointerIntPair class.
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
APInt abs() const
Get the absolute value.
Definition: APInt.h:1737
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
bool isStrictlyPositive() const
Determine if this APInt Value is positive.
Definition: APInt.h:334
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
This templated class represents "all analyses that operate over <a particular IR unit>" (e....
Definition: Analysis.h:47
API to communicate dependencies between analyses during invalidation.
Definition: PassManager.h:387
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA)
Trigger the invalidation of some other analysis pass if not already handled and return whether it was...
Definition: PassManager.h:405
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:205
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:276
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:579
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
iterator findValue(const ElemTy &V) const
findValue - Return an iterator to the specified value.
iterator insert(const ElemTy &Data)
insert - Insert a new value into the union/find set, ignoring the request if the value already exists...
member_iterator member_end() const
typename std::set< ECValue, ECValueComparator >::const_iterator iterator
iterator* - Provides a way to iterate over all values in the set.
member_iterator member_begin(iterator I) const
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
union - Merge the two equivalence sets for the specified values, inserting them if they do not alread...
const ElemTy & getLeaderValue(const ElemTy &V) const
getLeaderValue - Return the leader for the specified value that is in the set.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:680
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:973
Type * getResultElementType() const
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:80
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
static constexpr LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
This analysis provides dependence information for the memory accesses of a loop.
Result run(Function &F, FunctionAnalysisManager &AM)
bool invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv)
const LoopAccessInfo & getInfo(Loop &L)
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
Return the Memory Dependence Checker, which can determine the loop-independent and loop-carried dependences b...
bool isInvariant(Value *V) const
Returns true if value V is loop invariant.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the information about the memory accesses in the loop.
LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI)
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
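A minimal consumer sketch (the pass name LAAReportPass is hypothetical) using only the entry points listed above; real clients such as the loop vectorizer do considerably more with the result.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
struct LAAReportPass : PassInfoMixin<LAAReportPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM) {
    auto &LAIs = FAM.getResult<LoopAccessAnalysis>(F);
    // Query each top-level loop; getInfo() lazily drives the analysis.
    for (Loop *L : FAM.getResult<LoopAnalysis>(F)) {
      const LoopAccessInfo &LAI = LAIs.getInfo(*L);
      LAI.print(dbgs()); // dependence, runtime-check and invariance report
    }
    return PreservedAnalyses::all();
  }
};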
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool isAnnotatedParallel() const
Returns true if the loop is annotated parallel.
Definition: LoopInfo.cpp:564
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
Metadata node.
Definition: Metadata.h:1067
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
ArrayRef< unsigned > getOrderForAccess(Value *Ptr, bool IsWrite) const
Return the program order indices for the access location (Ptr, IsWrite).
bool isSafeForAnyVectorWidth() const
Return true if the number of elements that are safe to operate on simultaneously is not bounded.
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, const DenseMap< Value *, const SCEV * > &Strides, const DenseMap< Value *, SmallVector< const Value *, 16 > > &UnderlyingObjects)
Check whether the dependencies between the accesses are safe.
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const Loop * getInnermostLoop() const
uint64_t getMaxSafeVectorWidthInBits() const
Return the number of elements that are safe to operate on simultaneously, multiplied by the size of t...
bool isSafeForVectorization() const
No memory dependence was encountered that would inhibit vectorization.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Find the set of instructions that read or write via Ptr.
VectorizationSafetyStatus
Type to keep track of the status of the dependence check.
bool shouldRetryWithRuntimeCheck() const
In some cases when the dependency check fails we can still vectorize the loop with a dynamic array ac...
void addAccess(StoreInst *SI)
Register the location (instructions are given increasing numbers) of a write access.
PointerIntPair< Value *, 1, bool > MemAccessInfo
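A compilable sketch (helper name and parameters are illustrative) of how the checker above is typically consumed once a LoopAccessInfo has been computed.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
static void listBackwardDeps(const LoopAccessInfo &LAI) {
  const MemoryDepChecker &DepChecker = LAI.getDepChecker();
  // getDependences() may return null if dependences were not recorded;
  // otherwise print the lexically backward ones.
  if (const auto *Deps = DepChecker.getDependences())
    for (const MemoryDepChecker::Dependence &Dep : *Deps)
      if (Dep.isBackward())
        Dep.print(dbgs(), /*Depth=*/2, DepChecker.getMemoryInstructions());
}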
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
LocationSize Size
The maximum size of the location, in address-units, or UnknownSize if the size is not known.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
const Value * Ptr
The address of the start of the location.
Root of the metadata hierarchy.
Definition: Metadata.h:62
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Diagnostic information for optimization analysis remarks.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
bool hasNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Returns true if we've proved that V doesn't wrap by means of a SCEV predicate.
void setNoOverflow(Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags)
Proves that V doesn't overflow by adding SCEV predicate.
void print(raw_ostream &OS, unsigned Depth) const
Print the SCEV mappings done by the Predicated Scalar Evolution.
const SCEVAddRecExpr * getAsAddRec(Value *V)
Attempts to produce an AddRecExpr for V by adding additional SCEV predicates.
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
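A compilable sketch of the predicated-SCEV pattern used throughout this file (names are illustrative): try to rewrite a pointer as an affine add-recurrence under run-time predicates and inspect its step.
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;
static const SCEV *getInvariantStep(PredicatedScalarEvolution &PSE, Value *Ptr,
                                    const Loop *L) {
  // getAsAddRec() may add SCEV predicates to make Ptr an AddRec.
  if (const SCEVAddRecExpr *AR = PSE.getAsAddRec(Ptr)) {
    const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
    if (PSE.getSE()->isLoopInvariant(Step, L))
      return Step; // the per-iteration increment of the address
  }
  return nullptr;
}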
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
PreservedAnalysisChecker getChecker() const
Build a checker for this PreservedAnalyses and the specified analysis type.
Definition: Analysis.h:264
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
bool Need
This flag indicates if we need to add the runtime check.
void reset()
Reset the state of the pointer runtime information.
unsigned getNumberOfChecks() const
Returns the number of run-time checks required according to needsChecking.
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
void print(raw_ostream &OS, unsigned Depth=0) const
Print the list of run-time memory checks necessary.
SmallVector< RuntimeCheckingPtrGroup, 2 > CheckingGroups
Holds a partitioning of pointers into "check groups".
void generateChecks(MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies)
Generate the checks and store it.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
void insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, Type *AccessTy, bool WritePtr, unsigned DepSetId, unsigned ASId, PredicatedScalarEvolution &PSE, bool NeedsFreeze)
Insert a pointer and calculate the start and end SCEVs.
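A compilable sketch (the helper name is illustrative) of querying the runtime-check summary described above; the object is normally obtained from a computed LoopAccessInfo.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
static void reportRuntimeChecks(const RuntimePointerChecking &RtPtrChecking) {
  if (!RtPtrChecking.Need)
    return; // no run-time disambiguation checks are required for this loop
  dbgs() << RtPtrChecking.getNumberOfChecks()
         << " run-time pointer comparisons required\n";
  RtPtrChecking.print(dbgs());
}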
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const
Return the value of this chain of recurrences at the specified iteration number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
NoWrapFlags getNoWrapFlags(NoWrapFlags Mask=NoWrapMask) const
virtual void print(raw_ostream &OS, unsigned Depth=0) const =0
Prints a textual representation of this predicate with an indentation of Depth.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getUMaxExpr(const SCEV *LHS, const SCEV *RHS)
const SCEVPredicate * getEqualPredicate(const SCEV *LHS, const SCEV *RHS)
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
const SCEV * getPtrToIntExpr(const SCEV *Op, Type *Ty)
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getUMinExpr(const SCEV *LHS, const SCEV *RHS, bool Sequential=false)
const SCEV * getStoreSizeOfExpr(Type *IntTy, Type *StoreTy)
Return an expression for the store size of StoreTy that is type IntTy.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getSizeOfExpr(Type *IntTy, TypeSize Size)
Return an expression for a TypeSize.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
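A compilable sketch (illustrative names) of how the subtraction helpers above combine into the dependence distances this file reasons about.
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;
static const SCEV *getDependenceDistance(ScalarEvolution &SE, Value *SrcPtr,
                                         Value *SinkPtr) {
  const SCEV *Src = SE.getSCEV(SrcPtr);
  const SCEV *Sink = SE.getSCEV(SinkPtr);
  // The distance may be a constant, an add-recurrence, a fully symbolic
  // expression, or SCEVCouldNotCompute if the pointer bases differ.
  return SE.getMinusSCEV(Sink, Src);
}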
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
An efficient, type-erasing, non-owning reference to a callable.
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:561
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
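A compilable sketch (illustrative names): the return value is the signed distance in units of ElemTy, or std::nullopt when no constant distance can be established.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>
using namespace llvm;
static bool isNextElement(Type *ElemTy, Value *PtrA, Value *PtrB,
                          const DataLayout &DL, ScalarEvolution &SE) {
  // True when PtrB points exactly one ElemTy element past PtrA.
  std::optional<int> Diff = getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL, SE);
  return Diff && *Diff == 1;
}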
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
Definition: LoopInfo.cpp:1053
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1937
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
bool NullPointerIsDefined(const Function *F, unsigned AS=0)
Check whether null pointer dereferencing is considered undefined behavior for a given function or an ...
Definition: Function.cpp:2014
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
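A compilable sketch (illustrative names): a result of +1 or -1, in units of the access type, is how consumers typically recognize consecutive forward or reverse accesses.
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>
using namespace llvm;
static bool isConsecutivePtr(PredicatedScalarEvolution &PSE, Type *AccessTy,
                             Value *Ptr, const Loop *Lp) {
  std::optional<int64_t> Stride =
      getPtrStride(PSE, AccessTy, Ptr, Lp); // symbolic-stride map defaulted
  return Stride && (*Stride == 1 || *Stride == -1);
}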
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1833
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1758
gep_type_iterator gep_type_begin(const User *GEP)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:783
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:777
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:786
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:26
Dependence between memory access instructions.
DepType Type
The type of the dependence.
bool isPossiblyBackward() const
May be a lexically backward dependence type (includes Unknown).
bool isForward() const
Lexically forward dependence.
bool isBackward() const
Lexically backward dependence.
void print(raw_ostream &OS, unsigned Depth, const SmallVectorImpl< Instruction * > &Instrs) const
Print the dependence.
Instruction * getDestination(const LoopAccessInfo &LAI) const
Return the destination instruction of the dependence.
Instruction * getSource(const LoopAccessInfo &LAI) const
Return the source instruction of the dependence.
DepType
The type of the dependence.
static const char * DepName[]
String version of the types.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
unsigned AddressSpace
Address space of the involved pointers.
bool addPointer(unsigned Index, RuntimePointerChecking &RtCheck)
Tries to add the pointer recorded in RtCheck at index Index to this pointer checking group.
bool NeedsFreeze
Whether the pointer needs to be frozen after expansion, e.g.
const SCEV * High
The SCEV expression which represents the upper bound of all the pointers in this group.
SmallVector< unsigned, 2 > Members
Indices of all the pointers that constitute this grouping.
RuntimeCheckingPtrGroup(unsigned Index, RuntimePointerChecking &RtCheck)
Create a new pointer checking group containing a single pointer, with index Index in RtCheck.
const SCEV * Low
The SCEV expression which represents the lower bound of all the pointers in this group.
bool IsWritePtr
Holds whether this pointer is used for writing to memory.
unsigned DependencySetId
Holds the id of the set of pointers that could be dependent because of a shared underlying object.
unsigned AliasSetId
Holds the id of the disjoint alias set to which this pointer belongs.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static unsigned VectorizationFactor
VF as overridden by the user.
static unsigned RuntimeMemoryCheckThreshold
When performing memory disambiguation checks at runtime do not make more than this number of compari...
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1459