LLVM 23.0.0git
LoopVectorizationPlanner.cpp
Go to the documentation of this file.
1//===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements VFSelectionContext methods for loop vectorization
11/// VF selection, independent of cost-modeling decisions.
12///
13//===----------------------------------------------------------------------===//
14
21#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27using namespace LoopVectorizationUtils;
28
29#define DEBUG_TYPE "loop-vectorize"
30
32
34 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
35 cl::desc("Maximize bandwidth when selecting vectorization factor which "
36 "will be determined by the smallest type in loop."));
37
39 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
41 cl::desc("Try wider VFs if they enable the use of vector variants"));
42
44 "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
45 cl::desc("Discard VFs if their register pressure is too high."));
46
48 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
50 "Pretend that scalable vectors are supported, even if the target does "
51 "not support them. This flag should only be used for testing."));
52
54 "prefer-inloop-reductions", cl::init(false), cl::Hidden,
55 cl::desc("Prefer in-loop vector reductions, "
56 "overriding the targets preference."));
57
58/// Note: This currently only applies to `llvm.masked.load` and
59/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
61 "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden,
62 cl::desc("Assume the target supports masked memory operations (used for "
63 "testing)."));
64
66 "force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden,
67 cl::desc("Assume the target supports gather/scatter operations (used for "
68 "testing)."));
69
70/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
71/// is passed, the message relates to that particular instruction.
72#ifndef NDEBUG
73static void debugVectorizationMessage(const StringRef Prefix,
74 const StringRef DebugMsg,
75 Instruction *I) {
76 dbgs() << "LV: " << Prefix << DebugMsg;
77 if (I != nullptr)
78 dbgs() << " " << *I;
79 else
80 dbgs() << '.';
81 dbgs() << '\n';
82}
83#endif
84
85/// Create an analysis remark that explains why vectorization failed
86/// \p RemarkName is the identifier for the remark. If \p I is passed it is an
87/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
88/// the location of the remark. If \p DL is passed, use it as debug location for
89/// the remark. \return the remark object that can be streamed to.
91 const Loop *TheLoop,
93 DebugLoc DL = {}) {
94 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
95 // If debug location is attached to the instruction, use it. Otherwise if DL
96 // was not provided, use the loop's.
97 if (I && I->getDebugLoc())
98 DL = I->getDebugLoc();
99 else if (!DL)
100 DL = TheLoop->getStartLoc();
101
102 return OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, DL, CodeRegion);
103}
104
106 const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
107 OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I) {
108 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
109 ORE->emit(createLVAnalysis(ORETag, TheLoop, I)
110 << "loop not vectorized: " << OREMsg);
111}
112
114 const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE,
115 const Loop *TheLoop, Instruction *I, DebugLoc DL) {
117 ORE->emit(createLVAnalysis(ORETag, TheLoop, I, DL) << Msg);
118}
119
121 Loop *TheLoop,
122 ElementCount VFWidth,
123 unsigned IC) {
125 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
126 nullptr));
127 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
128 ORE->emit([&]() {
129 return OptimizationRemark(DEBUG_TYPE, "Vectorized", TheLoop->getStartLoc(),
130 TheLoop->getHeader())
131 << "vectorized " << LoopType << "loop (vectorization width: "
132 << ore::NV("VectorizationFactor", VFWidth)
133 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
134 });
135}
136
138 ElementCount VF) const {
140 auto *Ty = getLoadStoreType(I);
141 const unsigned AS = getLoadStoreAddressSpace(I);
142 const Align Alignment = getLoadStoreAlignment(I);
143
145 (isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
146 : TTI.isLegalMaskedStore(Ty, Alignment, AS));
147}
148
150 ElementCount VF) const {
151 bool LI = isa<LoadInst>(V);
152 bool SI = isa<StoreInst>(V);
153 if (!LI && !SI)
154 return false;
155 auto *Ty = getLoadStoreType(V);
157 if (VF.isVector())
158 Ty = VectorType::get(Ty, VF);
160 (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
161 (SI && TTI.isLegalMaskedScatter(Ty, Align));
162}
163
165 return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors ||
167}
168
169bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const {
173 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
174 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
176 Legal->hasVectorCallVariants())));
177}
178
180 if (ConsiderRegPressure.getNumOccurrences())
181 return ConsiderRegPressure;
182
183 // TODO: We should eventually consider register pressure for all targets. The
184 // TTI hook is temporary whilst target-specific issues are being fixed.
185 if (TTI.shouldConsiderVectorizationRegPressure())
186 return true;
187
188 if (!useMaxBandwidth(VF.isScalable()))
189 return false;
190 // Only calculate register pressure for VFs enabled by MaxBandwidth.
192 VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
193 : MaxPermissibleVFWithoutMaxBW.FixedVF);
194}
195
196ElementCount VFSelectionContext::clampVFByMaxTripCount(
197 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
198 bool FoldTailByMasking, bool RequiresScalarEpilogue) const {
199 unsigned EstimatedVF = VF.getKnownMinValue();
200 if (VF.isScalable() && F.hasFnAttribute(Attribute::VScaleRange)) {
201 auto Attr = F.getFnAttribute(Attribute::VScaleRange);
202 auto Min = Attr.getVScaleRangeMin();
203 EstimatedVF *= Min;
204 }
205
206 // When a scalar epilogue is required, at least one iteration of the scalar
207 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
208 // max VF that results in a dead vector loop.
209 if (MaxTripCount > 0 && RequiresScalarEpilogue)
210 MaxTripCount -= 1;
211
212 // When the user specifies an interleave count, we need to ensure that
213 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
214 unsigned IC = UserIC > 0 ? UserIC : 1;
215 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
216
217 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
218 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
219 // If upper bound loop trip count (TC) is known at compile time there is no
220 // point in choosing VF greater than TC / IC (as done in the loop below).
221 // Select maximum power of two which doesn't exceed TC / IC. If VF is
222 // scalable, we only fall back on a fixed VF when the TC is less than or
223 // equal to the known number of lanes.
224 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC);
225 if (ClampedUpperTripCount == 0)
226 ClampedUpperTripCount = 1;
227 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
228 "exceeding the constant trip count"
229 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
230 << ClampedUpperTripCount << "\n");
231 return ElementCount::get(ClampedUpperTripCount,
232 FoldTailByMasking ? VF.isScalable() : false);
233 }
234 return VF;
235}
236
237ElementCount VFSelectionContext::getMaximizedVFForTarget(
238 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
239 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking,
240 bool RequiresScalarEpilogue) {
241 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
242 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
243 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
245
246 // Convenience function to return the minimum of two ElementCounts.
247 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
248 assert((LHS.isScalable() == RHS.isScalable()) &&
249 "Scalable flags must match");
251 };
252
253 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
254 // Note that both WidestRegister and WidestType may not be a powers of 2.
255 auto MaxVectorElementCount = ElementCount::get(
256 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
257 ComputeScalableMaxVF);
258 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
259 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
260 << (MaxVectorElementCount * WidestType) << " bits.\n");
261
262 if (!MaxVectorElementCount) {
263 LLVM_DEBUG(dbgs() << "LV: The target has no "
264 << (ComputeScalableMaxVF ? "scalable" : "fixed")
265 << " vector registers.\n");
266 return ElementCount::getFixed(1);
267 }
268
269 ElementCount MaxVF =
270 clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC,
271 FoldTailByMasking, RequiresScalarEpilogue);
272 // If the MaxVF was already clamped, there's no point in trying to pick a
273 // larger one.
274 if (MaxVF != MaxVectorElementCount)
275 return MaxVF;
276
277 if (MaxVF.isScalable())
278 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
279 else
280 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
281
282 if (useMaxBandwidth(ComputeScalableMaxVF)) {
283 auto MaxVectorElementCountMaxBW = ElementCount::get(
284 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
285 ComputeScalableMaxVF);
286 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
287
288 if (ElementCount MinVF =
289 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
290 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
291 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
292 << ") with target's minimum: " << MinVF << '\n');
293 MaxVF = MinVF;
294 }
295 }
296
297 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC,
298 FoldTailByMasking, RequiresScalarEpilogue);
299 }
300 return MaxVF;
301}
302
303std::optional<unsigned> llvm::getMaxVScale(const Function &F,
304 const TargetTransformInfo &TTI) {
305 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
306 return MaxVScale;
307
308 if (F.hasFnAttribute(Attribute::VScaleRange))
309 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
310
311 return std::nullopt;
312}
313
314bool VFSelectionContext::isScalableVectorizationAllowed() {
315 if (IsScalableVectorizationAllowed)
316 return *IsScalableVectorizationAllowed;
317
318 IsScalableVectorizationAllowed = false;
320 return false;
321
322 if (Hints->isScalableVectorizationDisabled()) {
323 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
324 "ScalableVectorizationDisabled", ORE, TheLoop);
325 return false;
326 }
327
328 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
329
330 auto MaxScalableVF = ElementCount::getScalable(
331 std::numeric_limits<ElementCount::ScalarTy>::max());
332
333 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
334 // FIXME: While for scalable vectors this is currently sufficient, this should
335 // be replaced by a more detailed mechanism that filters out specific VFs,
336 // instead of invalidating vectorization for a whole set of VFs based on the
337 // MaxVF.
338
339 // Disable scalable vectorization if the loop contains unsupported reductions.
340 if (!all_of(Legal->getReductionVars(), [&](const auto &Reduction) -> bool {
341 return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF);
342 })) {
344 "Scalable vectorization not supported for the reduction "
345 "operations found in this loop.",
346 "ScalableVFUnfeasible", ORE, TheLoop);
347 return false;
348 }
349
350 // Disable scalable vectorization if the loop contains any instructions
351 // with element types not supported for scalable vectors.
352 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
353 return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
354 })) {
355 reportVectorizationInfo("Scalable vectorization is not supported "
356 "for all element types found in this loop.",
357 "ScalableVFUnfeasible", ORE, TheLoop);
358 return false;
359 }
360
361 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) {
362 reportVectorizationInfo("The target does not provide maximum vscale value "
363 "for safe distance analysis.",
364 "ScalableVFUnfeasible", ORE, TheLoop);
365 return false;
366 }
367
368 IsScalableVectorizationAllowed = true;
369 return true;
370}
371
373VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) {
374 if (!isScalableVectorizationAllowed())
376
377 auto MaxScalableVF = ElementCount::getScalable(
378 std::numeric_limits<ElementCount::ScalarTy>::max());
379 if (Legal->isSafeForAnyVectorWidth())
380 return MaxScalableVF;
381
382 std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI);
383 // Limit MaxScalableVF by the maximum safe dependence distance.
384 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
385
386 if (!MaxScalableVF)
388 "Max legal vector width too small, scalable vectorization "
389 "unfeasible.",
390 "ScalableVFUnfeasible", ORE, TheLoop);
391
392 return MaxScalableVF;
393}
394
396 unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
397 bool FoldTailByMasking, bool RequiresScalarEpilogue) {
398 auto [SmallestType, WidestType] = getSmallestAndWidestTypes();
399
400 // Get the maximum safe dependence distance in bits computed by LAA.
401 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
402 // the memory accesses that is most restrictive (involved in the smallest
403 // dependence distance).
404 unsigned MaxSafeElementsPowerOf2 =
405 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
406 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
407 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
408 MaxSafeElementsPowerOf2 =
409 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
410 }
411
412 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
413 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
414
415 if (!Legal->isSafeForAnyVectorWidth())
416 MaxSafeElements = MaxSafeElementsPowerOf2;
417
418 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
419 << ".\n");
420 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
421 << ".\n");
422
423 // First analyze the UserVF, fall back if the UserVF should be ignored.
424 if (UserVF) {
425 auto MaxSafeUserVF =
426 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
427
428 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
429 // If `VF=vscale x N` is safe, then so is `VF=N`
430 if (UserVF.isScalable())
431 return FixedScalableVFPair(
432 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
433
434 return UserVF;
435 }
436
437 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
438
439 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
440 // is better to ignore the hint and let the compiler choose a suitable VF.
441 if (!UserVF.isScalable()) {
442 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
443 << " is unsafe, clamping to max safe VF="
444 << MaxSafeFixedVF << ".\n");
445 ORE->emit([&]() {
446 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
447 TheLoop->getStartLoc(),
448 TheLoop->getHeader())
449 << "User-specified vectorization factor "
450 << ore::NV("UserVectorizationFactor", UserVF)
451 << " is unsafe, clamping to maximum safe vectorization factor "
452 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
453 });
454 return MaxSafeFixedVF;
455 }
456
458 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
459 << " is ignored because scalable vectors are not "
460 "available.\n");
461 ORE->emit([&]() {
462 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
463 TheLoop->getStartLoc(),
464 TheLoop->getHeader())
465 << "User-specified vectorization factor "
466 << ore::NV("UserVectorizationFactor", UserVF)
467 << " is ignored because the target does not support scalable "
468 "vectors. The compiler will pick a more suitable value.";
469 });
470 } else {
471 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
472 << " is unsafe. Ignoring scalable UserVF.\n");
473 ORE->emit([&]() {
474 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
475 TheLoop->getStartLoc(),
476 TheLoop->getHeader())
477 << "User-specified vectorization factor "
478 << ore::NV("UserVectorizationFactor", UserVF)
479 << " is unsafe. Ignoring the hint to let the compiler pick a "
480 "more suitable value.";
481 });
482 }
483 }
484
485 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
486 << " / " << WidestType << " bits.\n");
487
490 if (auto MaxVF = getMaximizedVFForTarget(
491 MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC,
492 FoldTailByMasking, RequiresScalarEpilogue))
493 Result.FixedVF = MaxVF;
494
495 if (auto MaxVF = getMaximizedVFForTarget(
496 MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC,
497 FoldTailByMasking, RequiresScalarEpilogue))
498 if (MaxVF.isScalable()) {
499 Result.ScalableVF = MaxVF;
500 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
501 << "\n");
502 }
503
504 return Result;
505}
506
507std::pair<unsigned, unsigned>
509 unsigned MinWidth = -1U;
510 unsigned MaxWidth = 8;
511 const DataLayout &DL = F.getDataLayout();
512 // For in-loop reductions, no element types are added to ElementTypesInLoop
513 // if there are no loads/stores in the loop. In this case, check through the
514 // reduction variables to determine the maximum width.
515 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
516 for (const auto &[_, RdxDesc] : Legal->getReductionVars()) {
517 // When finding the min width used by the recurrence we need to account
518 // for casts on the input operands of the recurrence.
519 MinWidth = std::min(
520 MinWidth,
521 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
522 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
523 MaxWidth = std::max(MaxWidth,
524 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
525 }
526 } else {
527 for (Type *T : ElementTypesInLoop) {
528 MinWidth = std::min<unsigned>(
529 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
530 MaxWidth = std::max<unsigned>(
531 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
532 }
533 }
534 return {MinWidth, MaxWidth};
535}
536
538 const SmallPtrSetImpl<const Value *> *ValuesToIgnore) {
539 ElementTypesInLoop.clear();
540 // For each block.
541 for (BasicBlock *BB : TheLoop->blocks()) {
542 // For each instruction in the loop.
543 for (Instruction &I : *BB) {
544 Type *T = I.getType();
545
546 // Skip ignored values.
547 if (ValuesToIgnore && ValuesToIgnore->contains(&I))
548 continue;
549
550 // Only examine Loads, Stores and PHINodes.
552 continue;
553
554 // Examine PHI nodes that are reduction variables. Update the type to
555 // account for the recurrence type.
556 if (auto *PN = dyn_cast<PHINode>(&I)) {
557 if (!Legal->isReductionVariable(PN))
558 continue;
559 const RecurrenceDescriptor &RdxDesc =
560 Legal->getRecurrenceDescriptor(PN);
562 TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
563 RdxDesc.getRecurrenceType()))
564 continue;
565 T = RdxDesc.getRecurrenceType();
566 }
567
568 // Examine the stored values.
569 if (auto *ST = dyn_cast<StoreInst>(&I))
570 T = ST->getValueOperand()->getType();
571
572 assert(T->isSized() &&
573 "Expected the load/store/recurrence type to be sized");
574
575 ElementTypesInLoop.insert(T);
576 }
577 }
578}
579
580void VFSelectionContext::initializeVScaleForTuning() {
582 return;
583
584 if (F.hasFnAttribute(Attribute::VScaleRange)) {
585 auto Attr = F.getFnAttribute(Attribute::VScaleRange);
586 auto Min = Attr.getVScaleRangeMin();
587 auto Max = Attr.getVScaleRangeMax();
588 if (Max && Min == Max) {
589 VScaleForTuning = Max;
590 return;
591 }
592 }
593
594 VScaleForTuning = TTI.getVScaleForTuning();
595}
596
598 const RecurrenceDescriptor &RdxDesc) const {
599 return !Hints->allowReordering() && RdxDesc.isOrdered();
600}
601
603 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
604
605 Loop *L = const_cast<Loop *>(TheLoop);
606 if (Legal->getRuntimePointerChecking()->Need) {
608 "Runtime ptr check is required with -Os/-Oz",
609 "runtime pointer checks needed. Enable vectorization of this "
610 "loop with '#pragma clang loop vectorize(enable)' when "
611 "compiling with -Os/-Oz",
612 "CantVersionLoopWithOptForSize", ORE, L);
613 return true;
614 }
615
616 if (!PSE.getPredicate().isAlwaysTrue()) {
618 "Runtime SCEV check is required with -Os/-Oz",
619 "runtime SCEV checks needed. Enable vectorization of this "
620 "loop with '#pragma clang loop vectorize(enable)' when "
621 "compiling with -Os/-Oz",
622 "CantVersionLoopWithOptForSize", ORE, L);
623 return true;
624 }
625
626 // FIXME: Avoid specializing for stride==1 instead of bailing out.
627 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
629 "Runtime stride check for small trip count",
630 "runtime stride == 1 checks needed. Enable vectorization of "
631 "this loop without such check by compiling with -Os/-Oz",
632 "CantVersionLoopWithOptForSize", ORE, L);
633 return true;
634 }
635
636 return false;
637}
638
640 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
641}
642
644 // Avoid duplicating work finding in-loop reductions.
645 if (!InLoopReductions.empty())
646 return;
647
648 for (const auto &Reduction : Legal->getReductionVars()) {
649 PHINode *Phi = Reduction.first;
650 const RecurrenceDescriptor &RdxDesc = Reduction.second;
651
652 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
653 // separately and should not be considered for in-loop reductions.
654 if (RdxDesc.hasUsesOutsideReductionChain())
655 continue;
656
657 // We don't collect reductions that are type promoted (yet).
658 if (RdxDesc.getRecurrenceType() != Phi->getType())
659 continue;
660
661 // In-loop AnyOf and FindIV reductions are not yet supported.
662 RecurKind Kind = RdxDesc.getRecurrenceKind();
666 continue;
667
668 // If the target would prefer this reduction to happen "in-loop", then we
669 // want to record it as such.
671 !TTI.preferInLoopReduction(Kind, Phi->getType()))
672 continue;
673
674 // Check that we can correctly put the reductions into the loop, by
675 // finding the chain of operations that leads from the phi to the loop
676 // exit value.
677 SmallVector<Instruction *, 4> ReductionOperations =
678 RdxDesc.getReductionOpChain(Phi, const_cast<Loop *>(TheLoop));
679 bool InLoop = !ReductionOperations.empty();
680
681 if (InLoop) {
682 InLoopReductions.insert(Phi);
683 // Add the elements to InLoopReductionImmediateChains for cost modelling.
684 Instruction *LastChain = Phi;
685 for (auto *I : ReductionOperations) {
686 InLoopReductionImmediateChains[I] = LastChain;
687 LastChain = I;
688 }
689 }
690 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
691 << " reduction for phi: " << *Phi << "\n");
692 }
693}
694
695bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
696 const VectorizationFactor &B,
697 const unsigned MaxTripCount,
698 bool HasTail,
699 bool IsEpilogue) const {
700 InstructionCost CostA = A.Cost;
701 InstructionCost CostB = B.Cost;
702
703 // When there is a hint to always prefer scalable vectors, honour that hint.
705 if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
706 !B.Width.isScalar())
707 return true;
708
709 // Improve estimate for the vector width if it is scalable.
710 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
711 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
712 if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
713 if (A.Width.isScalable())
714 EstimatedWidthA *= *VScale;
715 if (B.Width.isScalable())
716 EstimatedWidthB *= *VScale;
717 }
718
719 // When optimizing for size choose whichever is smallest, which will be the
720 // one with the smallest cost for the whole loop. On a tie pick the larger
721 // vector width, on the assumption that throughput will be greater.
722 if (Config.CostKind == TTI::TCK_CodeSize)
723 return CostA < CostB ||
724 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
725
726 // Assume vscale may be larger than 1 (or the value being tuned for),
727 // so that scalable vectorization is slightly favorable over fixed-width
728 // vectorization.
729 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
730 A.Width.isScalable() && !B.Width.isScalable();
731
732 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
733 const InstructionCost &RHS) {
734 return PreferScalable ? LHS <= RHS : LHS < RHS;
735 };
736
737 // To avoid the need for FP division:
738 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
739 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
740 bool LowerCostWithoutTC =
741 CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
742 if (!MaxTripCount)
743 return LowerCostWithoutTC;
744
745 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
746 InstructionCost VectorCost,
747 InstructionCost ScalarCost) {
748 // If the trip count is a known (possibly small) constant, the trip count
749 // will be rounded up to an integer number of iterations under
750 // FoldTailByMasking. The total cost in that case will be
751 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
752 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
753 // some extra overheads, but for the purpose of comparing the costs of
754 // different VFs we can use this to compare the total loop-body cost
755 // expected after vectorization.
756 if (HasTail)
757 return VectorCost * (MaxTripCount / VF) +
758 ScalarCost * (MaxTripCount % VF);
759 return VectorCost * divideCeil(MaxTripCount, VF);
760 };
761
762 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
763 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
764 bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
765 LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
766 dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
767 << " has lower cost than VF "
768 << (LowerCostWithTC ? B.Width : A.Width)
769 << " when taking the cost of the remaining scalar loop iterations "
770 "into consideration for a maximum trip count of "
771 << MaxTripCount << ".\n";
772 });
773 return LowerCostWithTC;
774}
775
776bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
777 const VectorizationFactor &B,
778 bool HasTail,
779 bool IsEpilogue) const {
780 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
781 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
782 IsEpilogue);
783}
784
785// TODO: we could return a pair of values that specify the max VF and
786// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
787// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
788// doesn't have a cost model that can choose which plan to execute if
789// more than one is generated.
792 if (UserVF.isScalable() && !supportsScalableVectors()) {
794 "Scalable vectorization requested but not supported by the target",
795 "the scalable user-specified vectorization width for outer-loop "
796 "vectorization cannot be used because the target does not support "
797 "scalable vectors.",
798 "ScalableVFUnfeasible", ORE, TheLoop);
800 }
801
802 ElementCount VF = UserVF;
803 if (VF.isZero()) {
804 auto [_, WidestType] = getSmallestAndWidestTypes();
805
806 auto RegKind = TTI.enableScalableVectorization()
809
810 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
811 unsigned N = std::max<uint64_t>(1, RegSize.getKnownMinValue() / WidestType);
812 VF = ElementCount::get(N, RegSize.isScalable());
813 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
814
815 // Make sure we have a VF > 1 for stress testing.
817 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
818 << "overriding computed VF.\n");
820 }
821 }
823 "VF needs to be a power of two");
824 if (VF.isScalar())
826 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
827 << "VF " << VF << " to build VPlans.\n");
828 return FixedScalableVFPair(VF);
829}
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define DEBUG_TYPE
#define _
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > ForceTargetSupportsGatherScatterOps("force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports gather/scatter operations (used for " "testing)."))
cl::opt< bool > VPlanBuildOuterloopStressTest
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static OptimizationRemarkAnalysis createLVAnalysis(StringRef RemarkName, const Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed RemarkName is the identifier for the...
static cl::opt< bool > ForceTargetSupportsMaskedMemoryOps("force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing)."))
Note: This currently only applies to llvm.masked.load and llvm.masked.store.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file provides a LoopVectorizationPlanner class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
LLVM Basic Block Representation.
Definition BasicBlock.h:62
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool contains(ConstPtrType Ptr) const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ TCK_CodeSize
Instruction code size.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF)
Returns a scalable VF to use for outer-loop vectorization if the target supports it and a fixed VF ot...
std::pair< unsigned, unsigned > getSmallestAndWidestTypes() const
const TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
bool runtimeChecksRequired()
Check whether vectorization would require runtime checks.
bool isLegalGatherOrScatter(Value *V, ElementCount VF) const
Returns true if the target machine can represent V as a masked gather or scatter operation.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue)
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool shouldConsiderRegPressureForVF(ElementCount VF) const
void collectElementTypesForWidening(const SmallPtrSetImpl< const Value * > *ValuesToIgnore=nullptr)
Collect element types in the loop that need widening.
bool isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const
Returns true if the target machine supports masked loads or stores for I's data type and alignment.
std::optional< unsigned > getVScaleForTuning() const
void computeMinimalBitwidths()
Compute smallest bitwidth each instruction can be represented with.
LLVM Value Representation.
Definition Value.h:75
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, ElementCount VFWidth, unsigned IC)
Report successful vectorization of the loop.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
cl::opt< bool > PreferInLoopReductions
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
static LLVM_ABI ElementCount VectorizationFactor
VF as overridden by the user.