LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
89 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
90 Ingredient.getDebugLoc());
91 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
92 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
93 if (VectorID == Intrinsic::not_intrinsic)
94 return false;
95
96 // The noalias.scope.decl intrinsic declares a noalias scope that
97 // is valid for a single iteration. Emitting it as a single-scalar
98 // replicate would incorrectly extend the scope across multiple
99 // original iterations packed into one vector iteration.
100 // FIXME: If we want to vectorize this loop, then we have to drop
101 // all the associated !alias.scope and !noalias.
102 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
103 return false;
104
105 // These intrinsics are recognized by getVectorIntrinsicIDForCall
106 // but are not widenable. Emit them as replicate instead of widening.
107 if (VectorID == Intrinsic::assume ||
108 VectorID == Intrinsic::lifetime_end ||
109 VectorID == Intrinsic::lifetime_start ||
110 VectorID == Intrinsic::sideeffect ||
111 VectorID == Intrinsic::pseudoprobe) {
112 // If the operand of llvm.assume holds before vectorization, it will
113 // also hold per lane.
114 // llvm.pseudoprobe requires to be duplicated per lane for accurate
115 // sample count.
116 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
117 VectorID != Intrinsic::pseudoprobe;
118 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
119 /*IsSingleScalar=*/IsSingleScalar,
120 /*Mask=*/nullptr, *VPI, *VPI,
121 Ingredient.getDebugLoc());
122 } else {
123 NewRecipe = new VPWidenIntrinsicRecipe(
124 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
125 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
126 }
127 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
128 NewRecipe = new VPWidenCastRecipe(
129 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
130 VPIRFlags(*CI), VPIRMetadata(*CI));
131 } else {
132 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
133 *VPI, Ingredient.getDebugLoc());
134 }
135 } else {
137 "inductions must be created earlier");
138 continue;
139 }
140
141 NewRecipe->insertBefore(&Ingredient);
142 if (NewRecipe->getNumDefinedValues() == 1)
143 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
144 else
145 assert(NewRecipe->getNumDefinedValues() == 0 &&
146 "Only recpies with zero or one defined values expected");
147 Ingredient.eraseFromParent();
148 }
149 }
150 return true;
151}
152
153/// Helper for extra no-alias checks via known-safe recipe and SCEV.
// NOTE(review): this doxygen-derived excerpt is missing several original
// lines here (the class header around line 154, a PSE member around line
// 157, the SCEVCouldNotCompute check around line 172, the MaxVF computation
// around line 192 and part of the constructor around lines 200-201) — treat
// the code below as a partial view of SinkStoreInfo (named at line 225).
 155 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
 156 VPReplicateRecipe &GroupLeader;
 158 const Loop &L;
 159 VPTypeAnalysis &TypeInfo;
 160
 161 // Return true if \p A and \p B are known to not alias for all VFs in the
 162 // plan, checked via the distance between the accesses
 163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
 164 if (A->getOpcode() != Instruction::Store ||
 165 B->getOpcode() != Instruction::Store)
 166 return false;
 167
 168 VPValue *AddrA = A->getOperand(1);
 169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
 170 VPValue *AddrB = B->getOperand(1);
 171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
 173 return false;
 174
 175 const APInt *Distance;
 176 ScalarEvolution &SE = *PSE.getSE();
 177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
 178 return false;
 179
 180 const DataLayout &DL = SE.getDataLayout();
 181 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
 182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
 183 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
 184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
 185
 186 // Use the maximum store size to ensure no overlap from either direction.
 187 // Currently only handles fixed sizes, as it is only used for
 188 // replicating VPReplicateRecipes.
 189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
 190
 191 auto VFs = B->getParent()->getPlan()->vectorFactors();
 193 if (MaxVF.isScalable())
 194 return false;
 195 return Distance->abs().uge(
 196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
 197 }
 198
 199public:
 202 const Loop &L, VPTypeAnalysis &TypeInfo)
 203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
 204 L(L), TypeInfo(TypeInfo) {}
 205
 206 /// Return true if \p R should be skipped during alias checking, either
 207 /// because it's in the exclude set or because no-alias can be proven via
 208 /// SCEV.
 209 bool shouldSkip(VPRecipeBase &R) const {
 210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
 211 return ExcludeRecipes.contains(&R) ||
 212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
 213 }
 214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that both
220/// read and write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
222static bool
// NOTE(review): the excerpt drops the original lines carrying the function
// name and first parameter (line 223), the block-traversal expression (line
// 231), the memory-location query (line 240) and the scope-based alias test
// itself (line 246) — partial view only.
 224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
 225 std::optional<SinkStoreInfo> SinkInfo = {}) {
 226 bool CheckReads = SinkInfo.has_value();
 227 if (!MemLoc.AATags.Scope)
 228 return false;
 229
 230 for (VPBasicBlock *VPBB :
 232 for (VPRecipeBase &R : *VPBB) {
 233 if (SinkInfo && SinkInfo->shouldSkip(R))
 234 continue;
 235
 236 // Skip recipes that don't need checking.
 237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
 238 continue;
 239
 241 if (!Loc)
 242 // Conservatively assume aliasing for memory operations without
 243 // location.
 244 return false;
 245
 247 return false;
 248 }
 249 }
 250 return true;
 251}
252
253/// Collect either replicated Loads or Stores grouped by their address SCEV, in
254/// a deep-traversal of the vector loop region in \p Plan.
255template <unsigned Opcode>
// NOTE(review): the excerpt drops the lines with the return type and function
// name (lines 256-257), the map declaration feeding `RecipesByAddress` (line
// 263), the block-traversal loop header (lines 265-266) and the sort lambda
// header (line 283) — partial view only.
 258 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
 259 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
 260 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
 261 "Only Load and Store opcodes supported");
 262 constexpr bool IsLoad = (Opcode == Instruction::Load);
 264 RecipesByAddress;
 267 for (VPRecipeBase &R : *VPBB) {
 268 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
 269 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
 270 continue;
 271
 272 // For loads, operand 0 is address; for stores, operand 1 is address.
 273 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
 274 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
 275 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
 276 RecipesByAddress[AddrSCEV].push_back(RepR);
 277 }
 278 }
 279 auto Groups = to_vector(RecipesByAddress.values());
 280 VPDominatorTree VPDT(Plan);
 281 for (auto &Group : Groups) {
 282 // Sort mem ops by dominance order, with earliest (most dominating) first.
 284 return VPDT.properlyDominates(A, B);
 285 });
 286 }
 287 return Groups;
 288}
289
290/// Return true if we do not know how to (mechanically) hoist or sink \p R out
291/// of a loop region. When sinking, passing \p Sinking = true ensures that
292/// assumes aren't sunk.
294 bool Sinking = false) {
295 // Assumes don't alias anything or throw; as long as they're guaranteed to
296 // execute, they're safe to hoist. They should however not be sunk, as it
297 // would destroy information.
299 return Sinking;
300
301 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
302 // memory location is not modified in the vector loop.
303 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
304 return true;
305
306 // Allocas cannot be hoisted.
307 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
308 return RepR && RepR->getOpcode() == Instruction::Alloca;
309}
310
311static bool sinkScalarOperands(VPlan &Plan) {
312 auto Iter = vp_depth_first_deep(Plan.getEntry());
313 bool ScalarVFOnly = Plan.hasScalarVFOnly();
314 bool Changed = false;
315
317 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
318 VPBasicBlock *SinkTo, VPValue *Op) {
319 auto *Candidate =
320 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
321 if (!Candidate)
322 return;
323
324 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
325 // for now.
327 return;
328
329 if (Candidate->getParent() == SinkTo ||
330 cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
331 return;
332
333 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
334 if (!ScalarVFOnly && RepR->isSingleScalar())
335 return;
336
337 WorkList.insert({SinkTo, Candidate});
338 };
339
340 // First, collect the operands of all recipes in replicate blocks as seeds for
341 // sinking.
343 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
344 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
345 continue;
346 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
347 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
348 continue;
349 for (auto &Recipe : *VPBB)
350 for (VPValue *Op : Recipe.operands())
351 InsertIfValidSinkCandidate(VPBB, Op);
352 }
353
354 // Try to sink each replicate or scalar IV steps recipe in the worklist.
355 for (unsigned I = 0; I != WorkList.size(); ++I) {
356 VPBasicBlock *SinkTo;
357 VPSingleDefRecipe *SinkCandidate;
358 std::tie(SinkTo, SinkCandidate) = WorkList[I];
359
360 // All recipe users of SinkCandidate must be in the same block SinkTo or all
361 // users outside of SinkTo must only use the first lane of SinkCandidate. In
362 // the latter case, we need to duplicate SinkCandidate.
363 auto UsersOutsideSinkTo =
364 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
365 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
366 });
367 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
368 return !U->usesFirstLaneOnly(SinkCandidate);
369 }))
370 continue;
371 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
372
373 if (NeedsDuplicating) {
374 if (ScalarVFOnly)
375 continue;
376 VPSingleDefRecipe *Clone;
377 if (auto *SinkCandidateRepR =
378 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
379 // TODO: Handle converting to uniform recipes as separate transform,
380 // then cloning should be sufficient here.
381 Instruction *I = SinkCandidate->getUnderlyingInstr();
382 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
383 nullptr /*Mask*/, *SinkCandidateRepR,
384 *SinkCandidateRepR);
385 // TODO: add ".cloned" suffix to name of Clone's VPValue.
386 } else {
387 Clone = SinkCandidate->clone();
388 }
389
390 Clone->insertBefore(SinkCandidate);
391 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
392 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
393 });
394 }
395 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
396 for (VPValue *Op : SinkCandidate->operands())
397 InsertIfValidSinkCandidate(SinkTo, Op);
398 Changed = true;
399 }
400 return Changed;
401}
402
403/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
404/// the mask.
406 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
407 if (!EntryBB || EntryBB->size() != 1 ||
408 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
409 return nullptr;
410
411 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
412}
413
414/// If \p R is a triangle region, return the 'then' block of the triangle.
416 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
417 if (EntryBB->getNumSuccessors() != 2)
418 return nullptr;
419
420 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
421 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
422 if (!Succ0 || !Succ1)
423 return nullptr;
424
425 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
426 return nullptr;
427 if (Succ0->getSingleSuccessor() == Succ1)
428 return Succ0;
429 if (Succ1->getSingleSuccessor() == Succ0)
430 return Succ1;
431 return nullptr;
432}
433
434// Merge replicate regions in their successor region, if a replicate region
435// is connected to a successor replicate region with the same predicate by a
436// single, empty VPBasicBlock.
438 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
439
440 // Collect replicate regions followed by an empty block, followed by another
441 // replicate region with matching masks to process front. This is to avoid
442 // iterator invalidation issues while merging regions.
445 vp_depth_first_deep(Plan.getEntry()))) {
446 if (!Region1->isReplicator())
447 continue;
448 auto *MiddleBasicBlock =
449 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
450 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
451 continue;
452
453 auto *Region2 =
454 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
455 if (!Region2 || !Region2->isReplicator())
456 continue;
457
458 VPValue *Mask1 = getPredicatedMask(Region1);
459 VPValue *Mask2 = getPredicatedMask(Region2);
460 if (!Mask1 || Mask1 != Mask2)
461 continue;
462
463 assert(Mask1 && Mask2 && "both region must have conditions");
464 WorkList.push_back(Region1);
465 }
466
467 // Move recipes from Region1 to its successor region, if both are triangles.
468 for (VPRegionBlock *Region1 : WorkList) {
469 if (TransformedRegions.contains(Region1))
470 continue;
471 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
472 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
473
474 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
475 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
476 if (!Then1 || !Then2)
477 continue;
478
479 // Note: No fusion-preventing memory dependencies are expected in either
480 // region. Such dependencies should be rejected during earlier dependence
481 // checks, which guarantee accesses can be re-ordered for vectorization.
482 //
483 // Move recipes to the successor region.
484 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
485 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
486
487 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
488 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
489
490 // Move VPPredInstPHIRecipes from the merge block to the successor region's
491 // merge block. Update all users inside the successor region to use the
492 // original values.
493 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
494 VPValue *PredInst1 =
495 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
496 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
497 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
498 return cast<VPRecipeBase>(&U)->getParent() == Then2;
499 });
500
501 // Remove phi recipes that are unused after merging the regions.
502 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
503 Phi1ToMove.eraseFromParent();
504 continue;
505 }
506 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
507 }
508
509 // Remove the dead recipes in Region1's entry block.
510 for (VPRecipeBase &R :
511 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
512 R.eraseFromParent();
513
514 // Finally, remove the first region.
515 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
516 VPBlockUtils::disconnectBlocks(Pred, Region1);
517 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
518 }
519 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
520 TransformedRegions.insert(Region1);
521 }
522
523 return !TransformedRegions.empty();
524}
525
527 VPlan &Plan) {
528 Instruction *Instr = PredRecipe->getUnderlyingInstr();
529 // Build the triangular if-then region.
530 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
531 assert(Instr->getParent() && "Predicated instruction not in any basic block");
532 auto *BlockInMask = PredRecipe->getMask();
533 auto *MaskDef = BlockInMask->getDefiningRecipe();
534 auto *BOMRecipe = new VPBranchOnMaskRecipe(
535 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
536 auto *Entry =
537 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
538
539 // Replace predicated replicate recipe with a replicate recipe without a
540 // mask but in the replicate region.
541 auto *RecipeWithoutMask = new VPReplicateRecipe(
542 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
543 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
544 PredRecipe->getDebugLoc());
545 auto *Pred =
546 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
547
548 VPPredInstPHIRecipe *PHIRecipe = nullptr;
549 if (PredRecipe->getNumUsers() != 0) {
550 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
551 RecipeWithoutMask->getDebugLoc());
552 PredRecipe->replaceAllUsesWith(PHIRecipe);
553 PHIRecipe->setOperand(0, RecipeWithoutMask);
554 }
555 PredRecipe->eraseFromParent();
556 auto *Exiting =
557 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
559 Plan.createReplicateRegion(Entry, Exiting, RegionName);
560
561 // Note: first set Entry as region entry and then connect successors starting
562 // from it in order, to propagate the "parent" of each VPBasicBlock.
563 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
564 VPBlockUtils::connectBlocks(Pred, Exiting);
565
566 return Region;
567}
568
569static void addReplicateRegions(VPlan &Plan) {
572 vp_depth_first_deep(Plan.getEntry()))) {
573 for (VPRecipeBase &R : *VPBB)
574 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
575 if (RepR->isPredicated())
576 WorkList.push_back(RepR);
577 }
578 }
579
580 unsigned BBNum = 0;
581 for (VPReplicateRecipe *RepR : WorkList) {
582 VPBasicBlock *CurrentBlock = RepR->getParent();
583 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
584
585 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
586 SplitBlock->setName(
587 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
588 // Record predicated instructions for above packing optimizations.
590 Region->setParent(CurrentBlock->getParent());
592
593 VPRegionBlock *ParentRegion = Region->getParent();
594 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
595 ParentRegion->setExiting(SplitBlock);
596 }
597}
598
602 vp_depth_first_deep(Plan.getEntry()))) {
603 // Don't fold the blocks in the skeleton of the Plan into their single
604 // predecessors for now.
605 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
606 if (!VPBB->getParent())
607 continue;
608 auto *PredVPBB =
609 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
610 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
611 isa<VPIRBasicBlock>(PredVPBB))
612 continue;
613 WorkList.push_back(VPBB);
614 }
615
616 for (VPBasicBlock *VPBB : WorkList) {
617 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
618 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
619 R.moveBefore(*PredVPBB, PredVPBB->end());
620 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
621 auto *ParentRegion = VPBB->getParent();
622 if (ParentRegion && ParentRegion->getExiting() == VPBB)
623 ParentRegion->setExiting(PredVPBB);
624 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
625 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
626 }
627 return !WorkList.empty();
628}
629
631 // Convert masked VPReplicateRecipes to if-then region blocks.
633
634 bool ShouldSimplify = true;
635 while (ShouldSimplify) {
636 ShouldSimplify = sinkScalarOperands(Plan);
637 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
638 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
639 }
640}
641
642/// Remove redundant casts of inductions.
643///
644/// Such redundant casts are casts of induction variables that can be ignored,
645/// because we already proved that the casted phi is equal to the uncasted phi
646/// in the vectorized loop. There is no need to vectorize the cast - the same
647/// value can be used for both the phi and casts in the vector loop.
649 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
651 if (!IV || IV->getTruncInst())
652 continue;
653
654 // A sequence of IR Casts has potentially been recorded for IV, which
655 // *must be bypassed* when the IV is vectorized, because the vectorized IV
656 // will produce the desired casted value. This sequence forms a def-use
657 // chain and is provided in reverse order, ending with the cast that uses
658 // the IV phi. Search for the recipe of the last cast in the chain and
659 // replace it with the original IV. Note that only the final cast is
660 // expected to have users outside the cast-chain and the dead casts left
661 // over will be cleaned up later.
662 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
663 VPValue *FindMyCast = IV;
664 for (Instruction *IRCast : reverse(Casts)) {
665 VPSingleDefRecipe *FoundUserCast = nullptr;
666 for (auto *U : FindMyCast->users()) {
667 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
668 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
669 FoundUserCast = UserCast;
670 break;
671 }
672 }
673 FindMyCast = FoundUserCast;
674 }
675 FindMyCast->replaceAllUsesWith(IV);
676 }
677}
678
679/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
680/// recipe, if it exists.
682 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
683 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
684 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
685 for (VPUser *U : CanonicalIV->users()) {
687 if (WidenNewIV)
688 break;
689 }
690
691 if (!WidenNewIV)
692 return;
693
694 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
695 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
696 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
697
698 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
699 continue;
700
701 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
702 // everything WidenNewIV's users need. That is, WidenOriginalIV will
703 // generate a vector phi or all users of WidenNewIV demand the first lane
704 // only.
705 if (Plan.hasScalarVFOnly() ||
706 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
707 vputils::onlyFirstLaneUsed(WidenNewIV)) {
708 // We are replacing a wide canonical iv with a suitable wide induction.
709 // This is used to compute header mask, hence all lanes will be used and
710 // we need to drop wrap flags only applying to lanes guranteed to execute
711 // in the original scalar loop.
712 WidenOriginalIV->dropPoisonGeneratingFlags();
713 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
714 WidenNewIV->eraseFromParent();
715 return;
716 }
717 }
718}
719
720/// Returns true if \p R is dead and can be removed.
721static bool isDeadRecipe(VPRecipeBase &R) {
722 // Do remove conditional assume instructions as their conditions may be
723 // flattened.
724 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
725 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
727 if (IsConditionalAssume)
728 return true;
729
730 if (R.mayHaveSideEffects())
731 return false;
732
733 // Recipe is dead if no user keeps the recipe alive.
734 return all_of(R.definedValues(),
735 [](VPValue *V) { return V->getNumUsers() == 0; });
736}
737
740 Plan.getEntry());
742 // The recipes in the block are processed in reverse order, to catch chains
743 // of dead recipes.
744 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
745 if (isDeadRecipe(R)) {
746 R.eraseFromParent();
747 continue;
748 }
749
750 // Check if R is a dead VPPhi <-> update cycle and remove it.
751 VPValue *Start, *Incoming;
752 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
753 continue;
754 auto *PhiR = cast<VPPhi>(&R);
755 VPUser *PhiUser = PhiR->getSingleUser();
756 if (!PhiUser)
757 continue;
758 if (PhiUser != Incoming->getDefiningRecipe() ||
759 Incoming->getNumUsers() != 1)
760 continue;
761 PhiR->replaceAllUsesWith(Start);
762 PhiR->eraseFromParent();
763 Incoming->getDefiningRecipe()->eraseFromParent();
764 }
765 }
766}
767
770 Instruction::BinaryOps InductionOpcode,
771 FPMathOperator *FPBinOp, Instruction *TruncI,
772 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
773 VPBuilder &Builder) {
774 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
775 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
776 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
777 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
778 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
779
780 // Truncate base induction if needed.
781 VPTypeAnalysis TypeInfo(Plan);
782 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
783 if (TruncI) {
784 Type *TruncTy = TruncI->getType();
785 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
786 "Not truncating.");
787 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
788 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
789 ResultTy = TruncTy;
790 }
791
792 // Truncate step if needed.
793 Type *StepTy = TypeInfo.inferScalarType(Step);
794 if (ResultTy != StepTy) {
795 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
796 "Not truncating.");
797 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
798 auto *VecPreheader =
800 VPBuilder::InsertPointGuard Guard(Builder);
801 Builder.setInsertPoint(VecPreheader);
802 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
803 }
804 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
805 &Plan.getVF(), DL);
806}
807
810 for (unsigned I = 0; I != Users.size(); ++I) {
812 if (isa<VPHeaderPHIRecipe>(Cur))
813 continue;
814 for (VPValue *V : Cur->definedValues())
815 Users.insert_range(V->users());
816 }
817 return Users.takeVector();
818}
819
820/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
821/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
822/// generates scalar values.
823static VPValue *
825 VPlan &Plan, VPBuilder &Builder) {
827 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
828 VPValue *StepV = PtrIV->getOperand(1);
830 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
831 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
832
833 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
834 PtrIV->getDebugLoc(), "next.gep");
835}
836
837/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
838/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
839/// VPWidenPointerInductionRecipe will generate vectors only. If some users
840/// require vectors while other require scalars, the scalar uses need to extract
841/// the scalars from the generated vectors (Note that this is different to how
842/// int/fp inductions are handled). Legalize extract-from-ends using uniform
843/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
844/// the correct end value is available. Also optimize
845/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
846/// providing them scalar steps built on the canonical scalar IV and update the
847/// original IV's users. This is an optional optimization to reduce the needs of
848/// vector extracts.
// NOTE(review): the scraped excerpt drops the original signature and
// HeaderVPBB declaration (lines 849-850) as well as the narrowing predicate
// (line 873), the ExtractValue match pattern (line 878) and the Steps
// declaration (line 909) — the code below is a partial view.
 851 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
 852 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
 853 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
 854 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
 855 if (!PhiR)
 856 continue;
 857
 858 // Try to narrow wide and replicating recipes to uniform recipes, based on
 859 // VPlan analysis.
 860 // TODO: Apply to all recipes in the future, to replace legacy uniformity
 861 // analysis.
 862 auto Users = collectUsersRecursively(PhiR);
 863 for (VPUser *U : reverse(Users)) {
 864 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
 865 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
 866 // Skip recipes that shouldn't be narrowed.
 867 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
 868 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
 869 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
 870 continue;
 871
 872 // Skip recipes that may have other lanes than their first used.
 874 continue;
 875
 876 // TODO: Support scalarizing ExtractValue.
 877 if (match(Def,
 879 continue;
 880
 881 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
 882 Def->operands(), /*IsUniform*/ true,
 883 /*Mask*/ nullptr, /*Flags*/ *Def);
 884 Clone->insertAfter(Def);
 885 Def->replaceAllUsesWith(Clone);
 886 }
 887
 888 // Replace wide pointer inductions which have only their scalars used by
 889 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
 890 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
 891 if (!Plan.hasScalarVFOnly() &&
 892 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
 893 continue;
 894
 895 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
 896 PtrIV->replaceAllUsesWith(PtrAdd);
 897 continue;
 898 }
 899
 900 // Replace widened induction with scalar steps for users that only use
 901 // scalars.
 902 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
 903 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
 904 return U->usesScalars(WideIV);
 905 }))
 906 continue;
 907
 908 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
 910 Plan, ID.getKind(), ID.getInductionOpcode(),
 911 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
 912 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
 913 WideIV->getDebugLoc(), Builder);
 914
 915 // Update scalar users of IV to use Step instead.
 916 if (!HasOnlyVectorVFs) {
 917 assert(!Plan.hasScalableVF() &&
 918 "plans containing a scalar VF cannot also include scalable VFs");
 919 WideIV->replaceAllUsesWith(Steps);
 920 } else {
 921 bool HasScalableVF = Plan.hasScalableVF();
 922 WideIV->replaceUsesWithIf(Steps,
 923 [WideIV, HasScalableVF](VPUser &U, unsigned) {
 924 if (HasScalableVF)
 925 return U.usesFirstLaneOnly(WideIV);
 926 return U.usesScalars(WideIV);
 927 });
 928 }
 929 }
 930}
931
932/// Check if \p VPV is an untruncated wide induction, either before or after the
933/// increment. If so return the header IV (before the increment), otherwise
934/// return null.
937 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
938 if (WideIV) {
939 // VPV itself is a wide induction, separately compute the end value for exit
940 // users if it is not a truncated IV.
941 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
942 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
943 }
944
945 // Check if VPV is an optimizable induction increment.
946 VPRecipeBase *Def = VPV->getDefiningRecipe();
947 if (!Def || Def->getNumOperands() != 2)
948 return nullptr;
949 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
950 if (!WideIV)
951 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
952 if (!WideIV)
953 return nullptr;
954
955 auto IsWideIVInc = [&]() {
956 auto &ID = WideIV->getInductionDescriptor();
957
958 // Check if VPV increments the induction by the induction step.
959 VPValue *IVStep = WideIV->getStepValue();
960 switch (ID.getInductionOpcode()) {
961 case Instruction::Add:
962 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
963 case Instruction::FAdd:
964 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
965 case Instruction::FSub:
966 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
967 m_Specific(IVStep)));
968 case Instruction::Sub: {
969 // IVStep will be the negated step of the subtraction. Check if Step == -1
970 // * IVStep.
971 VPValue *Step;
972 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
973 return false;
974 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
975 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
976 ScalarEvolution &SE = *PSE.getSE();
977 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
978 !isa<SCEVCouldNotCompute>(StepSCEV) &&
979 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
980 }
981 default:
982 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
983 match(VPV, m_GetElementPtr(m_Specific(WideIV),
984 m_Specific(WideIV->getStepValue())));
985 }
986 llvm_unreachable("should have been covered by switch above");
987 };
988 return IsWideIVInc() ? WideIV : nullptr;
989}
990
991/// Attempts to optimize the induction variable exit values for users in the
992/// early exit block.
994 VPTypeAnalysis &TypeInfo,
995 VPBlockBase *PredVPBB,
996 VPValue *Op,
998 VPValue *Incoming, *Mask;
1000 m_VPValue(Incoming))))
1001 return nullptr;
1002
1003 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1004 if (!WideIV)
1005 return nullptr;
1006
1007 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1008 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1009 return nullptr;
1010
1011 // Calculate the final index.
1012 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1013 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1014 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1015 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
1016
1017 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
1018 VPValue *FirstActiveLane =
1019 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
1020 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1021 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1022 FirstActiveLaneType, DL);
1023 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1024
1025 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1026 // changed it means the exit is using the incremented value, so we need to
1027 // add the step.
1028 if (Incoming != WideIV) {
1029 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1030 EndValue = B.createAdd(EndValue, One, DL);
1031 }
1032
1033 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1034 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1035 VPIRValue *Start = WideIV->getStartValue();
1036 VPValue *Step = WideIV->getStepValue();
1037 EndValue = B.createDerivedIV(
1038 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1039 Start, EndValue, Step);
1040 }
1041
1042 return EndValue;
1043}
1044
1045/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1046/// VPDerivedIVRecipe for non-canonical inductions.
1048 VPBuilder &VectorPHBuilder,
1049 VPTypeAnalysis &TypeInfo,
1050 VPValue *VectorTC) {
1051 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1052 // Truncated wide inductions resume from the last lane of their vector value
1053 // in the last vector iteration which is handled elsewhere.
1054 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1055 return nullptr;
1056
1057 VPIRValue *Start = WideIV->getStartValue();
1058 VPValue *Step = WideIV->getStepValue();
1060 VPValue *EndValue = VectorTC;
1061 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1062 EndValue = VectorPHBuilder.createDerivedIV(
1063 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1064 Start, VectorTC, Step);
1065 }
1066
1067 // EndValue is derived from the vector trip count (which has the same type as
1068 // the widest induction) and thus may be wider than the induction here.
1069 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1070 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1071 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1072 ScalarTypeOfWideIV,
1073 WideIV->getDebugLoc());
1074 }
1075
1076 return EndValue;
1077}
1078
1079/// Attempts to optimize the induction variable exit values for users in the
1080/// exit block coming from the latch in the original scalar loop.
1082 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1084 VPValue *Incoming;
1085 VPWidenInductionRecipe *WideIV = nullptr;
1087 WideIV = getOptimizableIVOf(Incoming, PSE);
1088
1089 if (!WideIV)
1090 return nullptr;
1091
1092 VPValue *EndValue = EndValues.lookup(WideIV);
1093 assert(EndValue && "Must have computed the end value up front");
1094
1095 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1096 // changed it means the exit is using the incremented value, so we don't
1097 // need to subtract the step.
1098 if (Incoming != WideIV)
1099 return EndValue;
1100
1101 // Otherwise, subtract the step from the EndValue.
1102 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1103 VPValue *Step = WideIV->getStepValue();
1104 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1105 if (ScalarTy->isIntegerTy())
1106 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1107 if (ScalarTy->isPointerTy()) {
1108 Type *StepTy = TypeInfo.inferScalarType(Step);
1109 auto *Zero = Plan.getZero(StepTy);
1110 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1111 DebugLoc::getUnknown(), "ind.escape");
1112 }
1113 if (ScalarTy->isFloatingPointTy()) {
1114 const auto &ID = WideIV->getInductionDescriptor();
1115 return B.createNaryOp(
1116 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1117 ? Instruction::FSub
1118 : Instruction::FAdd,
1119 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1120 }
1121 llvm_unreachable("all possible induction types must be handled");
1122 return nullptr;
1123}
1124
1126 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1127 // Compute end values for all inductions.
1128 VPTypeAnalysis TypeInfo(Plan);
1129 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1130 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1131 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1133 VPValue *ResumeTC =
1134 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1135 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1136 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1137 if (!WideIV)
1138 continue;
1140 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1141 EndValues[WideIV] = EndValue;
1142 }
1143
1144 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1145 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1146 VPValue *Op;
1147 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1148 continue;
1149 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1150 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1151 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1152 R.eraseFromParent();
1153 }
1154 }
1155
1156 // Then, optimize exit block users.
1157 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1158 for (VPRecipeBase &R : ExitVPBB->phis()) {
1159 auto *ExitIRI = cast<VPIRPhi>(&R);
1160
1161 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1162 VPValue *Escape = nullptr;
1163 if (PredVPBB == MiddleVPBB)
1164 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1165 ExitIRI->getOperand(Idx),
1166 EndValues, PSE);
1167 else
1169 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1170 if (Escape)
1171 ExitIRI->setOperand(Idx, Escape);
1172 }
1173 }
1174 }
1175}
1176
1177/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1178/// them with already existing recipes expanding the same SCEV expression.
1181
1182 for (VPRecipeBase &R :
1184 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1185 if (!ExpR)
1186 continue;
1187
1188 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1189 if (Inserted)
1190 continue;
1191 ExpR->replaceAllUsesWith(V->second);
1192 ExpR->eraseFromParent();
1193 }
1194}
1195
1197 SmallVector<VPValue *> WorkList;
1199 WorkList.push_back(V);
1200
1201 while (!WorkList.empty()) {
1202 VPValue *Cur = WorkList.pop_back_val();
1203 if (!Seen.insert(Cur).second)
1204 continue;
1205 VPRecipeBase *R = Cur->getDefiningRecipe();
1206 if (!R)
1207 continue;
1208 if (!isDeadRecipe(*R))
1209 continue;
1210 append_range(WorkList, R->operands());
1211 R->eraseFromParent();
1212 }
1213}
1214
1215/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1216/// Returns an optional pair, where the first element indicates whether it is
1217/// an intrinsic ID.
1218static std::optional<std::pair<bool, unsigned>>
1220 return TypeSwitch<const VPSingleDefRecipe *,
1221 std::optional<std::pair<bool, unsigned>>>(R)
1224 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1225 .Case([](const VPWidenIntrinsicRecipe *I) {
1226 return std::make_pair(true, I->getVectorIntrinsicID());
1227 })
1228 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1229 [](auto *I) {
1230 // For recipes that do not directly map to LLVM IR instructions,
1231 // assign opcodes after the last VPInstruction opcode (which is also
1232 // after the last IR Instruction opcode), based on the VPRecipeID.
1233 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1234 I->getVPRecipeID());
1235 })
1236 .Default([](auto *) { return std::nullopt; });
1237}
1238
1239/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1240/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1241/// Operands are foldable live-ins.
1243 ArrayRef<VPValue *> Operands,
1244 const DataLayout &DL,
1245 VPTypeAnalysis &TypeInfo) {
1246 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1247 if (!OpcodeOrIID)
1248 return nullptr;
1249
1251 for (VPValue *Op : Operands) {
1252 if (!match(Op, m_LiveIn()))
1253 return nullptr;
1254 Value *V = Op->getUnderlyingValue();
1255 if (!V)
1256 return nullptr;
1257 Ops.push_back(V);
1258 }
1259
1260 auto FoldToIRValue = [&]() -> Value * {
1261 InstSimplifyFolder Folder(DL);
1262 if (OpcodeOrIID->first) {
1263 if (R.getNumOperands() != 2)
1264 return nullptr;
1265 unsigned ID = OpcodeOrIID->second;
1266 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1267 TypeInfo.inferScalarType(&R));
1268 }
1269 unsigned Opcode = OpcodeOrIID->second;
1270 if (Instruction::isBinaryOp(Opcode))
1271 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1272 Ops[0], Ops[1]);
1273 if (Instruction::isCast(Opcode))
1274 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1275 TypeInfo.inferScalarType(R.getVPSingleValue()));
1276 switch (Opcode) {
1278 return Folder.FoldSelect(Ops[0], Ops[1],
1280 case VPInstruction::Not:
1281 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1283 case Instruction::Select:
1284 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1285 case Instruction::ICmp:
1286 case Instruction::FCmp:
1287 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1288 Ops[1]);
1289 case Instruction::GetElementPtr: {
1290 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1291 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1292 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1293 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1294 }
1297 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1298 Ops[0], Ops[1],
1299 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1300 // An extract of a live-in is an extract of a broadcast, so return the
1301 // broadcasted element.
1302 case Instruction::ExtractElement:
1303 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1304 return Ops[0];
1305 }
1306 return nullptr;
1307 };
1308
1309 if (Value *V = FoldToIRValue())
1310 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1311 return nullptr;
1312}
1313
1314/// Try to simplify VPSingleDefRecipe \p Def.
1316 VPlan *Plan = Def->getParent()->getPlan();
1317
1318 // Simplification of live-in IR values for SingleDef recipes using
1319 // InstSimplifyFolder.
1320 const DataLayout &DL = Plan->getDataLayout();
1321 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1322 return Def->replaceAllUsesWith(V);
1323
1324 // Fold PredPHI LiveIn -> LiveIn.
1325 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1326 VPValue *Op = PredPHI->getOperand(0);
1327 if (isa<VPIRValue>(Op))
1328 PredPHI->replaceAllUsesWith(Op);
1329 }
1330
1331 VPBuilder Builder(Def);
1332
1333 // Avoid replacing VPInstructions with underlying values with new
1334 // VPInstructions, as we would fail to create widen/replicate recpes from the
1335 // new VPInstructions without an underlying value, and miss out on some
1336 // transformations that only apply to widened/replicated recipes later, by
1337 // doing so.
1338 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1339 // VPInstructions without underlying values, as those will get skipped during
1340 // cost computation.
1341 bool CanCreateNewRecipe =
1342 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1343
1344 VPValue *A;
1345 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1346 Type *TruncTy = TypeInfo.inferScalarType(Def);
1347 Type *ATy = TypeInfo.inferScalarType(A);
1348 if (TruncTy == ATy) {
1349 Def->replaceAllUsesWith(A);
1350 } else {
1351 // Don't replace a non-widened cast recipe with a widened cast.
1352 if (!isa<VPWidenCastRecipe>(Def))
1353 return;
1354 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1355
1356 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1357 ? Instruction::SExt
1358 : Instruction::ZExt;
1359 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1360 TruncTy);
1361 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1362 // UnderlyingExt has distinct return type, used to retain legacy cost.
1363 Ext->setUnderlyingValue(UnderlyingExt);
1364 }
1365 Def->replaceAllUsesWith(Ext);
1366 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1367 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1368 Def->replaceAllUsesWith(Trunc);
1369 }
1370 }
1371#ifndef NDEBUG
1372 // Verify that the cached type info is for both A and its users is still
1373 // accurate by comparing it to freshly computed types.
1374 VPTypeAnalysis TypeInfo2(*Plan);
1375 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1376 for (VPUser *U : A->users()) {
1377 auto *R = cast<VPRecipeBase>(U);
1378 for (VPValue *VPV : R->definedValues())
1379 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1380 }
1381#endif
1382 }
1383
1384 // Simplify (X && Y) | (X && !Y) -> X.
1385 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1386 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1387 // recipes to be visited during simplification.
1388 VPValue *X, *Y, *Z;
1389 if (match(Def,
1392 Def->replaceAllUsesWith(X);
1393 Def->eraseFromParent();
1394 return;
1395 }
1396
1397 // x | AllOnes -> AllOnes
1398 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1399 return Def->replaceAllUsesWith(
1400 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1401
1402 // x | 0 -> x
1403 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1404 return Def->replaceAllUsesWith(X);
1405
1406 // x | !x -> AllOnes
1408 return Def->replaceAllUsesWith(
1409 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1410
1411 // x & 0 -> 0
1412 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1413 return Def->replaceAllUsesWith(
1414 Plan->getZero(TypeInfo.inferScalarType(Def)));
1415
1416 // x & AllOnes -> x
1417 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1418 return Def->replaceAllUsesWith(X);
1419
1420 // x && false -> false
1421 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1422 return Def->replaceAllUsesWith(Plan->getFalse());
1423
1424 // x && true -> x
1425 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1426 return Def->replaceAllUsesWith(X);
1427
1428 // (x && y) | (x && z) -> x && (y | z)
1429 if (CanCreateNewRecipe &&
1432 // Simplify only if one of the operands has one use to avoid creating an
1433 // extra recipe.
1434 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1435 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1436 return Def->replaceAllUsesWith(
1437 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1438
1439 // x && (x && y) -> x && y
1440 if (match(Def, m_LogicalAnd(m_VPValue(X),
1442 return Def->replaceAllUsesWith(Def->getOperand(1));
1443
1444 // x && (y && x) -> x && y
1445 if (match(Def, m_LogicalAnd(m_VPValue(X),
1447 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1448
1449 // x && !x -> 0
1451 return Def->replaceAllUsesWith(Plan->getFalse());
1452
1453 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1454 return Def->replaceAllUsesWith(X);
1455
1456 // select c, false, true -> not c
1457 VPValue *C;
1458 if (CanCreateNewRecipe &&
1459 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1460 return Def->replaceAllUsesWith(Builder.createNot(C));
1461
1462 // select !c, x, y -> select c, y, x
1463 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1464 Def->setOperand(0, C);
1465 Def->setOperand(1, Y);
1466 Def->setOperand(2, X);
1467 return;
1468 }
1469
1470 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1471 return Def->replaceAllUsesWith(A);
1472
1473 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1474 return Def->replaceAllUsesWith(A);
1475
1476 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1477 return Def->replaceAllUsesWith(
1478 Plan->getZero(TypeInfo.inferScalarType(Def)));
1479
1480 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1481 // Preserve nsw from the Mul on the new Sub.
1483 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1484 return Def->replaceAllUsesWith(
1485 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1486 Def->getDebugLoc(), "", NW));
1487 }
1488
1489 if (CanCreateNewRecipe &&
1491 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1492 // new Sub.
1494 false,
1495 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1496 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1497 ->hasNoSignedWrap()};
1498 return Def->replaceAllUsesWith(
1499 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1500 }
1501
1502 const APInt *APC;
1503 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1504 APC->isPowerOf2())
1505 return Def->replaceAllUsesWith(Builder.createNaryOp(
1506 Instruction::Shl,
1507 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1508 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1509
1510 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1511 APC->isPowerOf2())
1512 return Def->replaceAllUsesWith(Builder.createNaryOp(
1513 Instruction::LShr,
1514 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1515 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1516
1517 if (match(Def, m_Not(m_VPValue(A)))) {
1518 if (match(A, m_Not(m_VPValue(A))))
1519 return Def->replaceAllUsesWith(A);
1520
1521 // Try to fold Not into compares by adjusting the predicate in-place.
1522 CmpPredicate Pred;
1523 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1524 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1525 if (all_of(Cmp->users(),
1527 m_Not(m_Specific(Cmp)),
1528 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1529 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1530 for (VPUser *U : to_vector(Cmp->users())) {
1531 auto *R = cast<VPSingleDefRecipe>(U);
1532 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1533 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1534 R->setOperand(1, Y);
1535 R->setOperand(2, X);
1536 } else {
1537 // not (cmp pred) -> cmp inv_pred
1538 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1539 R->replaceAllUsesWith(Cmp);
1540 }
1541 }
1542 // If Cmp doesn't have a debug location, use the one from the negation,
1543 // to preserve the location.
1544 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1545 Cmp->setDebugLoc(Def->getDebugLoc());
1546 }
1547 }
1548 }
1549
1550 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1551 // any-of (fcmp uno %A, %B), ...
1552 if (match(Def, m_AnyOf())) {
1554 VPRecipeBase *UnpairedCmp = nullptr;
1555 for (VPValue *Op : Def->operands()) {
1556 VPValue *X;
1557 if (Op->getNumUsers() > 1 ||
1559 m_Deferred(X)))) {
1560 NewOps.push_back(Op);
1561 } else if (!UnpairedCmp) {
1562 UnpairedCmp = Op->getDefiningRecipe();
1563 } else {
1564 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1565 UnpairedCmp->getOperand(0), X));
1566 UnpairedCmp = nullptr;
1567 }
1568 }
1569
1570 if (UnpairedCmp)
1571 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1572
1573 if (NewOps.size() < Def->getNumOperands()) {
1574 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1575 return Def->replaceAllUsesWith(NewAnyOf);
1576 }
1577 }
1578
1579 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1580 // This is useful for fmax/fmin without fast-math flags, where we need to
1581 // check if any operand is NaN.
1582 if (CanCreateNewRecipe &&
1584 m_Deferred(X)),
1586 m_Deferred(Y))))) {
1587 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1588 return Def->replaceAllUsesWith(NewCmp);
1589 }
1590
1591 // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1592 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1593 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1594 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1595 TypeInfo.inferScalarType(Def))
1596 return Def->replaceAllUsesWith(Def->getOperand(1));
1597
1599 m_One()))) {
1600 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1601 if (TypeInfo.inferScalarType(X) != WideStepTy)
1602 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1603 Def->replaceAllUsesWith(X);
1604 return;
1605 }
1606
1607 // For i1 vp.merges produced by AnyOf reductions:
1608 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1610 m_VPValue(X), m_VPValue())) &&
1612 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1613 Def->setOperand(1, Def->getOperand(0));
1614 Def->setOperand(0, Y);
1615 return;
1616 }
1617
1618 // Simplify MaskedCond with no block mask to its single operand.
1620 !cast<VPInstruction>(Def)->isMasked())
1621 return Def->replaceAllUsesWith(Def->getOperand(0));
1622
1623 // Look through ExtractLastLane.
1624 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1625 if (match(A, m_BuildVector())) {
1626 auto *BuildVector = cast<VPInstruction>(A);
1627 Def->replaceAllUsesWith(
1628 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1629 return;
1630 }
1631 if (Plan->hasScalarVFOnly())
1632 return Def->replaceAllUsesWith(A);
1633 }
1634
1635 // Look through ExtractPenultimateElement (BuildVector ....).
1637 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1638 Def->replaceAllUsesWith(
1639 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1640 return;
1641 }
1642
1643 uint64_t Idx;
1645 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1646 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1647 return;
1648 }
1649
1650 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1651 Def->replaceAllUsesWith(
1652 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1653 return;
1654 }
1655
1656 // Look through broadcast of single-scalar when used as select conditions; in
1657 // that case the scalar condition can be used directly.
1658 if (match(Def,
1661 "broadcast operand must be single-scalar");
1662 Def->setOperand(0, C);
1663 return;
1664 }
1665
1667 if (Def->getNumOperands() == 1) {
1668 Def->replaceAllUsesWith(Def->getOperand(0));
1669 return;
1670 }
1671 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1672 if (all_equal(Phi->incoming_values()))
1673 Phi->replaceAllUsesWith(Phi->getOperand(0));
1674 }
1675 return;
1676 }
1677
1678 VPIRValue *IRV;
1679 if (Def->getNumOperands() == 1 &&
1681 return Def->replaceAllUsesWith(IRV);
1682
1683 // Some simplifications can only be applied after unrolling. Perform them
1684 // below.
1685 if (!Plan->isUnrolled())
1686 return;
1687
1688 // After unrolling, extract-lane may be used to extract values from multiple
1689 // scalar sources. Only simplify when extracting from a single scalar source.
1690 VPValue *LaneToExtract;
1691 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1692 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1694 return Def->replaceAllUsesWith(A);
1695
1696 // Simplify extract-lane with single source to extract-element.
1697 Def->replaceAllUsesWith(Builder.createNaryOp(
1698 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1699 return;
1700 }
1701
1702 // Look for cycles where Def is of the form:
1703 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1704 // IVInc = X + Step ; used by X and Def
1705 // Def = IVInc + Y
1706 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1707 // and if Inc exists, replace it with X.
1708 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1709 isa<VPIRValue>(Y) &&
1710 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1711 auto *Phi = cast<VPPhi>(X);
1712 auto *IVInc = Def->getOperand(0);
1713 if (IVInc->getNumUsers() == 2) {
1714 // If Phi has a second user (besides IVInc's defining recipe), it must
1715 // be Inc = Phi + Y for the fold to apply.
1718 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1719 Def->replaceAllUsesWith(IVInc);
1720 if (Inc)
1721 Inc->replaceAllUsesWith(Phi);
1722 Phi->setOperand(0, Y);
1723 return;
1724 }
1725 }
1726 }
1727
1728 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1729 // just the pointer operand.
1730 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1731 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1732 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1733
1734 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1735 // the start index is zero and only the first lane 0 is demanded.
1736 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1737 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1738 Steps->replaceAllUsesWith(Steps->getOperand(0));
1739 return;
1740 }
1741 }
1742 // Simplify redundant ReductionStartVector recipes after unrolling.
1743 VPValue *StartV;
1745 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1746 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1747 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1748 return PhiR && PhiR->isInLoop();
1749 });
1750 return;
1751 }
1752
1754 Def->replaceAllUsesWith(A);
1755 return;
1756 }
1757
1758 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1761 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1762 all_of(A->users(),
1763 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1764 return Def->replaceAllUsesWith(A);
1765 }
1766
1767 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1768 return Def->replaceAllUsesWith(A);
1769}
1770
1773 Plan.getEntry());
1774 VPTypeAnalysis TypeInfo(Plan);
1776 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1777 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1778 simplifyRecipe(Def, TypeInfo);
1779 }
1780}
1781
1782/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1783/// header mask to be simplified further when tail folding, e.g. in
1784/// optimizeEVLMasks.
1785static void reassociateHeaderMask(VPlan &Plan) {
1786 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1787 if (!HeaderMask)
1788 return;
1789
1790 SmallVector<VPUser *> Worklist;
1791 for (VPUser *U : HeaderMask->users())
1792 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1794
1795 while (!Worklist.empty()) {
1796 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1797 VPValue *X, *Y;
1798 if (!R || !match(R, m_LogicalAnd(
1799 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1800 m_VPValue(Y))))
1801 continue;
1802 append_range(Worklist, R->users());
1803 VPBuilder Builder(R);
1804 R->replaceAllUsesWith(
1805 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1806 }
1807}
1808
1810 if (Plan.hasScalarVFOnly())
1811 return;
1812
1813 // Try to narrow wide and replicating recipes to single scalar recipes,
1814 // based on VPlan analysis. Only process blocks in the loop region for now,
1815 // without traversing into nested regions, as recipes in replicate regions
1816 // cannot be converted yet.
1819 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1821 VPWidenStoreRecipe>(&R))
1822 continue;
1823 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1824 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1825 continue;
1826
1827 // Convert an unmasked scatter with an uniform address into
1828 // extract-last-lane + scalar store.
1829 // TODO: Add a profitability check comparing the cost of a scatter vs.
1830 // extract + scalar store.
1831 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1832 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1833 !WidenStoreR->isConsecutive()) {
1834 VPValue *Mask = WidenStoreR->getMask();
1835
1836 // Only convert the scatter to a scalar store if it is unmasked.
1837 // TODO: Support converting scatter masked by the header mask to scalar
1838 // store.
1839 if (Mask)
1840 continue;
1841
1843 {WidenStoreR->getOperand(1)});
1844 Extract->insertBefore(WidenStoreR);
1845
1846 // TODO: Sink the scalar store recipe to middle block if possible.
1847 auto *ScalarStore = new VPReplicateRecipe(
1848 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1849 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1850 *WidenStoreR /*Metadata*/);
1851 ScalarStore->insertBefore(WidenStoreR);
1852 WidenStoreR->eraseFromParent();
1853 continue;
1854 }
1855
1856 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1857 if (RepR && RepR->getOpcode() == Instruction::Store &&
1858 vputils::isSingleScalar(RepR->getOperand(1))) {
1859 auto *Clone = new VPReplicateRecipe(
1860 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1861 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1862 *RepR /*Metadata*/, RepR->getDebugLoc());
1863 Clone->insertBefore(RepOrWidenR);
1864 VPBuilder Builder(Clone);
1865 VPValue *ExtractOp = Clone->getOperand(0);
// ExtractLastPart is only emitted when the stored-to address is uniform
// across VFs and UFs, i.e. every unroll part stores to the same location and
// only the final part's value must survive.
1866 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1867 ExtractOp =
1868 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1869 ExtractOp =
1870 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1871 Clone->setOperand(0, ExtractOp);
1872 RepR->eraseFromParent();
1873 continue;
1874 }
1875
1876 // Skip recipes that aren't single scalars.
1877 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1878 continue;
1879
1880 // Predicate to check if a user of Op introduces extra broadcasts.
1881 auto IntroducesBCastOf = [](const VPValue *Op) {
1882 return [Op](const VPUser *U) {
1883 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1887 VPI->getOpcode()))
1888 return false;
1889 }
1890 return !U->usesScalars(Op);
1891 };
1892 };
1893
// Only narrow when doing so does not trade one broadcast for another: the
// recipe's own users must not need a broadcast of it, and none of its
// operands may start needing one instead.
1894 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1895 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1896 if (any_of(
1897 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1898 IntroducesBCastOf(Op)))
1899 return false;
1900 // Non-constant live-ins require broadcasts, while constants do not
1901 // need explicit broadcasts.
1902 auto *IRV = dyn_cast<VPIRValue>(Op);
1903 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1904 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1905 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1906 }))
1907 continue;
1908
1909 auto *Clone = new VPReplicateRecipe(
1910 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1911 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1912 Clone->insertBefore(RepOrWidenR);
1913 RepOrWidenR->replaceAllUsesWith(Clone);
1914 if (isDeadRecipe(*RepOrWidenR))
1915 RepOrWidenR->eraseFromParent();
1916 }
1917 }
1918}
1919
1920/// Try to see if all of \p Blend's masks share a common value logically and'ed
1921/// and remove it from the masks.
1923 if (Blend->isNormalized())
1924 return;
1925 VPValue *CommonEdgeMask;
1926 if (!match(Blend->getMask(0),
1927 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1928 return;
// Every mask must be of the form (CommonEdgeMask && X); bail on the first
// mismatch so the blend is left untouched.
1929 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1930 if (!match(Blend->getMask(I),
1931 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1932 return;
// All masks matched: strip the common operand, keeping each logical-and's
// second operand as the new mask.
1933 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1934 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1935}
1936
1937/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1938/// to make sure the masks are simplified.
1939static void simplifyBlends(VPlan &Plan) {
1942 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1943 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1944 if (!Blend)
1945 continue;
1946
1947 removeCommonBlendMask(Blend);
1948
1949 // Try to remove redundant blend recipes.
// Incoming values whose mask is known false can never be selected, so they
// are excluded when collecting the set of possibly-selected values.
1950 SmallPtrSet<VPValue *, 4> UniqueValues;
1951 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1952 UniqueValues.insert(Blend->getIncomingValue(0));
1953 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1954 if (!match(Blend->getMask(I), m_False()))
1955 UniqueValues.insert(Blend->getIncomingValue(I));
1956
1957 if (UniqueValues.size() == 1) {
1958 Blend->replaceAllUsesWith(*UniqueValues.begin());
1959 Blend->eraseFromParent();
1960 continue;
1961 }
1962
1963 if (Blend->isNormalized())
1964 continue;
1965
1966 // Normalize the blend so its first incoming value is used as the initial
1967 // value with the others blended into it.
1968
1969 unsigned StartIndex = 0;
1970 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1971 // If a value's mask is used only by the blend then is can be deadcoded.
1972 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1973 // that's used by multiple blends where it can be removed from them all.
1974 VPValue *Mask = Blend->getMask(I);
1975 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1976 StartIndex = I;
1977 break;
1978 }
1979 }
1980
// The normalized operand list is: start value, then (value, mask) pairs for
// all remaining incoming values.
1981 SmallVector<VPValue *, 4> OperandsWithMask;
1982 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1983
1984 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1985 if (I == StartIndex)
1986 continue;
1987 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1988 OperandsWithMask.push_back(Blend->getMask(I));
1989 }
1990
1991 auto *NewBlend =
1992 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1993 OperandsWithMask, *Blend, Blend->getDebugLoc());
1994 NewBlend->insertBefore(&R);
1995
1996 VPValue *DeadMask = Blend->getMask(StartIndex);
1997 Blend->replaceAllUsesWith(NewBlend);
1998 Blend->eraseFromParent();
2000
2001 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2002 VPValue *NewMask;
2003 if (NewBlend->getNumOperands() == 3 &&
2004 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2005 VPValue *Inc0 = NewBlend->getOperand(0);
2006 VPValue *Inc1 = NewBlend->getOperand(1);
2007 VPValue *OldMask = NewBlend->getOperand(2);
2008 NewBlend->setOperand(0, Inc1);
2009 NewBlend->setOperand(1, Inc0);
2010 NewBlend->setOperand(2, NewMask);
2011 if (OldMask->getNumUsers() == 0)
2012 cast<VPInstruction>(OldMask)->eraseFromParent();
2013 }
2014 }
2015 }
2016}
2017
2018/// Optimize the width of vector induction variables in \p Plan based on a known
2019/// constant Trip Count, \p BestVF and \p BestUF.
2021 ElementCount BestVF,
2022 unsigned BestUF) {
2023 // Only proceed if we have not completely removed the vector region.
2024 if (!Plan.getVectorLoopRegion())
2025 return false;
2026
// Requires a fixed VF and a compile-time-constant trip count; otherwise the
// narrowed width cannot be proven sufficient.
2027 const APInt *TC;
2028 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2029 return false;
2030
2031 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2032 // and UF. Returns at least 8.
2033 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2034 APInt AlignedTC =
2037 APInt MaxVal = AlignedTC - 1;
2038 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2039 };
2040 unsigned NewBitWidth =
2041 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2042
2043 LLVMContext &Ctx = Plan.getContext();
2044 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2045
2046 bool MadeChange = false;
2047
2048 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2049 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2050 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2051
2052 // Currently only handle canonical IVs as it is trivial to replace the start
2053 // and stop values, and we currently only perform the optimization when the
2054 // IV has a single use.
2055 if (!WideIV || !WideIV->isCanonical() ||
2056 WideIV->hasMoreThanOneUniqueUser() ||
2057 NewIVTy == WideIV->getScalarType())
2058 continue;
2059
2060 // Currently only handle cases where the single user is a header-mask
2061 // comparison with the backedge-taken-count.
2062 VPUser *SingleUser = WideIV->getSingleUser();
2063 if (!SingleUser ||
2064 !match(SingleUser,
2065 m_ICmp(m_Specific(WideIV),
2067 continue;
2068
2069 // Update IV operands and comparison bound to use new narrower type.
2070 auto *NewStart = Plan.getZero(NewIVTy);
2071 WideIV->setStartValue(NewStart);
2072 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2073 WideIV->setStepValue(NewStep);
2074
// Truncate the backedge-taken count to the narrowed type in the preheader so
// the header-mask compare operates entirely in the new width.
2075 auto *NewBTC = new VPWidenCastRecipe(
2076 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2077 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2078 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2079 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2080 Cmp->setOperand(1, NewBTC);
2081
2082 MadeChange = true;
2083 }
2084
2085 return MadeChange;
2086}
2087
2088/// Return true if \p Cond is known to be true for given \p BestVF and \p
2089/// BestUF.
2091 ElementCount BestVF, unsigned BestUF,
2094 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2095 &PSE](VPValue *C) {
2096 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2097 });
2098
// Otherwise, Cond must be icmp eq (CanIV + VFxUF), vector-trip-count; any
// other shape cannot be proven true here.
2099 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2102 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2103 m_Specific(&Plan.getVectorTripCount()))))
2104 return false;
2105
2106 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2107 // count is not conveniently available as SCEV so far, so we compare directly
2108 // against the original trip count. This is stricter than necessary, as we
2109 // will only return true if the trip count == vector trip count.
2110 const SCEV *VectorTripCount =
2112 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2113 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2114 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2115 "Trip count SCEV must be computable");
2116 ScalarEvolution &SE = *PSE.getSE();
2117 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2118 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2119 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2120}
2121
2122/// Try to replace multiple active lane masks used for control flow with
2123/// a single, wide active lane mask instruction followed by multiple
2124/// extract subvector intrinsics. This applies to the active lane mask
2125/// instructions both in the loop and in the preheader.
2126/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2127/// new extracts from the first active lane mask, which has it's last
2128/// operand (multiplier) set to UF.
2130 unsigned UF) {
2131 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2132 return false;
2133
2134 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2135 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2136 auto *Term = &ExitingVPBB->back();
2137
2138 using namespace llvm::VPlanPatternMatch;
2140 m_VPValue(), m_VPValue(), m_VPValue())))))
2141 return false;
2142
2143 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2144 LLVMContext &Ctx = Plan.getContext();
2145
// Helper: emit one llvm.vector.extract recipe per unroll part, reading the
// wide mask at offset VF * Part, inserted directly after the wide mask.
2146 auto ExtractFromALM = [&](VPInstruction *ALM,
2147 SmallVectorImpl<VPValue *> &Extracts) {
2148 DebugLoc DL = ALM->getDebugLoc();
2149 for (unsigned Part = 0; Part < UF; ++Part) {
2151 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2152 auto *Ext =
2153 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2154 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2155 Extracts[Part] = Ext;
2156 Ext->insertAfter(ALM);
2157 }
2158 };
2159
2160 // Create a list of each active lane mask phi, ordered by unroll part.
2162 for (VPRecipeBase &R : Header->phis()) {
2164 if (!Phi)
2165 continue;
2166 VPValue *Index = nullptr;
2167 match(Phi->getBackedgeValue(),
2169 assert(Index && "Expected index from ActiveLaneMask instruction");
2170
2171 uint64_t Part;
2172 if (match(Index,
2174 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2175 Phis[Part] = Phi;
2176 else {
2177 // Anything other than a CanonicalIVIncrementForPart is part 0
2178 assert(!match(
2179 Index,
2181 Phis[0] = Phi;
2182 }
2183 }
2184
2185 assert(all_of(Phis, not_equal_to(nullptr)) &&
2186 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2187
2188 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2189 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2190
2191 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2192 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2193 "Expected incoming values of Phi to be ActiveLaneMasks");
2194
2195 // When using wide lane masks, the return type of the get.active.lane.mask
2196 // intrinsic is VF x UF (last operand).
2197 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2198 EntryALM->setOperand(2, ALMMultiplier);
2199 LoopALM->setOperand(2, ALMMultiplier);
2200
2201 // Create UF x extract vectors and insert into preheader.
2202 SmallVector<VPValue *> EntryExtracts(UF);
2203 ExtractFromALM(EntryALM, EntryExtracts);
2204
2205 // Create UF x extract vectors and insert before the loop compare & branch,
2206 // updating the compare to use the first extract.
2207 SmallVector<VPValue *> LoopExtracts(UF);
2208 ExtractFromALM(LoopALM, LoopExtracts);
2209 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2210 Not->setOperand(0, LoopExtracts[0]);
2211
2212 // Update the incoming values of active lane mask phis.
2213 for (unsigned Part = 0; Part < UF; ++Part) {
2214 Phis[Part]->setStartValue(EntryExtracts[Part]);
2215 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2216 }
2217
2218 return true;
2219}
2220
2221/// Try to simplify the branch condition of \p Plan. This may restrict the
2222/// resulting plan to \p BestVF and \p BestUF.
2224 unsigned BestUF,
2226 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2227 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
// The latch terminator is the last recipe of the exiting block.
2228 auto *Term = &ExitingVPBB->back();
2229 VPValue *Cond;
2230 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2231 // Check if the branch condition compares the canonical IV increment (for main
2232 // loop), or the canonical IV increment plus an offset (for epilog loop).
2233 if (match(Term, m_BranchOnCount(
2234 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2235 m_VPValue())) ||
2237 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2238 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2239 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2240 const SCEV *VectorTripCount =
2242 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2243 VectorTripCount =
2245 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2246 "Trip count SCEV must be computable");
2247 ScalarEvolution &SE = *PSE.getSE();
2248 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2249 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2250 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2251 return false;
2252 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2254 // For BranchOnCond, check if we can prove the condition to be true using VF
2255 // and UF.
2256 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2257 return false;
2258 } else {
2259 return false;
2260 }
2261
2262 // The vector loop region only executes once. Convert terminator of the
2263 // exiting block to exit in the first iteration.
2264 if (match(Term, m_BranchOnTwoConds())) {
2265 Term->setOperand(1, Plan.getTrue());
2266 return true;
2267 }
2268
2269 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2270 {}, Term->getDebugLoc());
2271 ExitingVPBB->appendRecipe(BOC);
2272 Term->eraseFromParent();
2273
2274 return true;
2275}
2276
2277/// From the definition of llvm.experimental.get.vector.length,
2278/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2282 vp_depth_first_deep(Plan.getEntry()))) {
2283 for (VPRecipeBase &R : *VPBB) {
2284 VPValue *AVL;
2285 if (!match(&R, m_EVL(m_VPValue(AVL))))
2286 continue;
2287
// Use SCEV to prove AVL <= VF; only then can the EVL computation be replaced
// by the AVL itself (truncated to the EVL's i32 type).
2288 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2289 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2290 continue;
2291 ScalarEvolution &SE = *PSE.getSE();
2292 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2293 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2294 continue;
2295
2297 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2298 R.getDebugLoc());
// If a truncate recipe was created, try to constant-fold it away when all of
// its operands are live-ins.
2299 if (Trunc != AVL) {
2300 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2301 const DataLayout &DL = Plan.getDataLayout();
2302 VPTypeAnalysis TypeInfo(Plan);
2303 if (VPValue *Folded =
2304 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2305 Trunc = Folded;
2306 }
2307 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2308 return true;
2309 }
2310 }
2311 return false;
2312}
2313
2315 unsigned BestUF,
2317 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2318 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2319
// Apply the VF/UF-specific optimizations; if any of them fired, pin the plan
// to BestVF (BestUF must already match the plan's concrete UF).
2320 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2321 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2322 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2323
2324 if (MadeChange) {
2325 Plan.setVF(BestVF);
2326 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2327 }
2328}
2329
2330/// Sink users of \p FOR after the recipe defining the previous value \p
2331/// Previous of the recurrence. \returns true if all users of \p FOR could be
2332/// re-arranged as needed or false if it is not possible.
2333static bool
2335 VPRecipeBase *Previous,
2336 VPDominatorTree &VPDT) {
2337 // If Previous is a live-in (no defining recipe), it naturally dominates all
2338 // recipes in the loop, so no sinking is needed.
2339 if (!Previous)
2340 return true;
2341
2342 // Collect recipes that need sinking.
2345 Seen.insert(Previous);
2346 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2347 // The previous value must not depend on the users of the recurrence phi. In
2348 // that case, FOR is not a fixed order recurrence.
2349 if (SinkCandidate == Previous)
2350 return false;
2351
2352 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2353 !Seen.insert(SinkCandidate).second ||
2354 VPDT.properlyDominates(Previous, SinkCandidate))
2355 return true;
2356
2357 if (cannotHoistOrSinkRecipe(*SinkCandidate, /*Sinking=*/true))
2358 return false;
2359
2360 WorkList.push_back(SinkCandidate);
2361 return true;
2362 };
2363
2364 // Recursively sink users of FOR after Previous.
2365 WorkList.push_back(FOR);
2366 for (unsigned I = 0; I != WorkList.size(); ++I) {
2367 VPRecipeBase *Current = WorkList[I];
2368 assert(Current->getNumDefinedValues() == 1 &&
2369 "only recipes with a single defined value expected");
2370
2371 for (VPUser *User : Current->getVPSingleValue()->users()) {
2372 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2373 return false;
2374 }
2375 }
2376
2377 // Keep recipes to sink ordered by dominance so earlier instructions are
2378 // processed first.
2379 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2380 return VPDT.properlyDominates(A, B);
2381 });
2382
// Move each candidate after the previously-moved one, preserving the sorted
// dominance order among the sunk recipes.
2383 for (VPRecipeBase *SinkCandidate : WorkList) {
2384 if (SinkCandidate == FOR)
2385 continue;
2386
2387 SinkCandidate->moveAfter(Previous);
2388 Previous = SinkCandidate;
2389 }
2390 return true;
2391}
2392
2393/// Try to hoist \p Previous and its operands before all users of \p FOR.
2395 VPRecipeBase *Previous,
2396 VPDominatorTree &VPDT) {
2397 if (cannotHoistOrSinkRecipe(*Previous))
2398 return false;
2399
2400 // Collect recipes that need hoisting.
2401 SmallVector<VPRecipeBase *> HoistCandidates;
2403 VPRecipeBase *HoistPoint = nullptr;
2404 // Find the closest hoist point by looking at all users of FOR and selecting
2405 // the recipe dominating all other users.
2406 for (VPUser *U : FOR->users()) {
2407 auto *R = cast<VPRecipeBase>(U);
2408 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2409 HoistPoint = R;
2410 }
2411 assert(all_of(FOR->users(),
2412 [&VPDT, HoistPoint](VPUser *U) {
2413 auto *R = cast<VPRecipeBase>(U);
2414 return HoistPoint == R ||
2415 VPDT.properlyDominates(HoistPoint, R);
2416 }) &&
2417 "HoistPoint must dominate all users of FOR");
2418
// Returns the defining recipe of HoistCandidateV if it still needs to be
// hoisted above HoistPoint, or nullptr if no hoisting is required (live-in,
// already visited, outside the loop region, a header phi, or already
// dominating HoistPoint).
2419 auto NeedsHoisting = [HoistPoint, &VPDT,
2420 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2421 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2422 if (!HoistCandidate)
2423 return nullptr;
2424 VPRegionBlock *EnclosingLoopRegion =
2425 HoistCandidate->getParent()->getEnclosingLoopRegion();
2426 assert((!HoistCandidate->getRegion() ||
2427 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2428 "CFG in VPlan should still be flat, without replicate regions");
2429 // Hoist candidate was already visited, no need to hoist.
2430 if (!Visited.insert(HoistCandidate).second)
2431 return nullptr;
2432
2433 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2434 // hoisting.
2435 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2436 return nullptr;
2437
2438 // If we reached a recipe that dominates HoistPoint, we don't need to
2439 // hoist the recipe.
2440 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2441 return nullptr;
2442 return HoistCandidate;
2443 };
2444
2445 if (!NeedsHoisting(Previous->getVPSingleValue()))
2446 return true;
2447
2448 // Recursively try to hoist Previous and its operands before all users of FOR.
2449 HoistCandidates.push_back(Previous);
2450
2451 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2452 VPRecipeBase *Current = HoistCandidates[I];
2453 assert(Current->getNumDefinedValues() == 1 &&
2454 "only recipes with a single defined value expected");
2455 if (cannotHoistOrSinkRecipe(*Current))
2456 return false;
2457
2458 for (VPValue *Op : Current->operands()) {
2459 // If we reach FOR, it means the original Previous depends on some other
2460 // recurrence that in turn depends on FOR. If that is the case, we would
2461 // also need to hoist recipes involving the other FOR, which may break
2462 // dependencies.
2463 if (Op == FOR)
2464 return false;
2465
2466 if (auto *R = NeedsHoisting(Op)) {
2467 // Bail out if the recipe defines multiple values.
2468 // TODO: Hoisting such recipes requires additional handling.
2469 if (R->getNumDefinedValues() != 1)
2470 return false;
2471 HoistCandidates.push_back(R);
2472 }
2473 }
2474 }
2475
2476 // Order recipes to hoist by dominance so earlier instructions are processed
2477 // first.
2478 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2479 return VPDT.properlyDominates(A, B);
2480 });
2481
// Moving in dominance order keeps each hoisted recipe's operands ahead of
// its users at the hoist point.
2482 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2483 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2484 HoistPoint->getIterator());
2485 }
2486
2487 return true;
2488}
2489
2491 VPBuilder &LoopBuilder) {
2492 VPDominatorTree VPDT(Plan);
2493
2495 for (VPRecipeBase &R :
2498 RecurrencePhis.push_back(FOR);
2499
2500 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2502 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2503 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2504 // to terminate.
// Follow chains of recurrence phis feeding the backedge (the loop condition
// is elided in this view; presumably a dyn_cast to a recurrence-phi recipe —
// confirm against full source) to find the real Previous.
2505 while (auto *PrevPhi =
2507 assert(PrevPhi->getParent() == FOR->getParent());
2508 assert(SeenPhis.insert(PrevPhi).second);
2509 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2510 }
2511
2512 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2513 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2514 return false;
2515
2516 // Introduce a recipe to combine the incoming and previous values of a
2517 // fixed-order recurrence.
2518 VPBasicBlock *InsertBlock =
2519 Previous ? Previous->getParent() : FOR->getParent();
2520 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2521 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2522 else
2523 LoopBuilder.setInsertPoint(InsertBlock,
2524 std::next(Previous->getIterator()));
2525
2526 auto *RecurSplice =
2528 {FOR, FOR->getBackedgeValue()});
2529
2530 FOR->replaceAllUsesWith(RecurSplice);
2531 // Set the first operand of RecurSplice to FOR again, after replacing
2532 // all users.
2533 RecurSplice->setOperand(0, FOR);
2534 }
2535 return true;
2536}
2537
2539 for (VPRecipeBase &R :
2541 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2542 if (!PhiR)
2543 continue;
// Only reduction kinds where reassociation can introduce poison are handled;
// the condition continues on an elided line (2546) — confirm the full kind
// list against the source.
2544 RecurKind RK = PhiR->getRecurrenceKind();
2545 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2547 continue;
2548
// Drop nuw/nsw and similar poison-generating flags from all transitive users
// of the reduction phi.
2549 for (VPUser *U : collectUsersRecursively(PhiR))
2550 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2551 RecWithFlags->dropPoisonGeneratingFlags();
2552 }
2553 }
2554}
2555
2556namespace {
2557struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2558 static bool isSentinel(const VPSingleDefRecipe *Def) {
2559 return Def == getEmptyKey() || Def == getTombstoneKey();
2560 }
2561
2562 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2563 /// return that source element type.
2564 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2565 // All VPInstructions that lower to GEPs must have the i8 source element
2566 // type (as they are PtrAdds), so we omit it.
2568 .Case([](const VPReplicateRecipe *I) -> Type * {
2569 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2570 return GEP->getSourceElementType();
2571 return nullptr;
2572 })
2573 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2574 [](auto *I) { return I->getSourceElementType(); })
2575 .Default([](auto *) { return nullptr; });
2576 }
2577
2578 /// Returns true if recipe \p Def can be safely handed for CSE.
2579 static bool canHandle(const VPSingleDefRecipe *Def) {
2580 // We can extend the list of handled recipes in the future,
2581 // provided we account for the data embedded in them while checking for
2582 // equality or hashing.
2583 auto C = getOpcodeOrIntrinsicID(Def);
2584
2585 // The issue with (Insert|Extract)Value is that the index of the
2586 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2587 // VPlan.
2588 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2589 C->second == Instruction::ExtractValue)))
2590 return false;
2591
2592 // During CSE, we can only handle recipes that don't read from memory: if
2593 // they read from memory, there could be an intervening write to memory
2594 // before the next instance is CSE'd, leading to an incorrect result.
2595 return !Def->mayReadFromMemory();
2596 }
2597
2598 /// Hash the underlying data of \p Def.
2599 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2600 const VPlan *Plan = Def->getParent()->getPlan();
// The inferred scalar type participates in the hash so recipes producing
// different types cannot collide into a single CSE class (matches the final
// type check in isEqual below).
2601 VPTypeAnalysis TypeInfo(*Plan);
2602 hash_code Result = hash_combine(
2603 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2604 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2606 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2607 if (RFlags->hasPredicate())
2608 return hash_combine(Result, RFlags->getPredicate());
2609 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2610 return hash_combine(Result, SIVSteps->getInductionOpcode());
2611 return Result;
2612 }
2613
2614 /// Check equality of underlying data of \p L and \p R.
2615 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2616 if (isSentinel(L) || isSentinel(R))
2617 return L == R;
2618 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2620 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2622 !equal(L->operands(), R->operands()))
2623 return false;
2625 "must have valid opcode info for both recipes");
2626 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2627 if (LFlags->hasPredicate() &&
2628 LFlags->getPredicate() !=
2629 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2630 return false;
2631 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2632 if (LSIV->getInductionOpcode() !=
2633 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2634 return false;
2635 // Recipes in replicate regions implicitly depend on predicate. If either
2636 // recipe is in a replicate region, only consider them equal if both have
2637 // the same parent.
2638 const VPRegionBlock *RegionL = L->getRegion();
2639 const VPRegionBlock *RegionR = R->getRegion();
2640 if (((RegionL && RegionL->isReplicator()) ||
2641 (RegionR && RegionR->isReplicator())) &&
2642 L->getParent() != R->getParent())
2643 return false;
2644 const VPlan *Plan = L->getParent()->getPlan();
2645 VPTypeAnalysis TypeInfo(*Plan);
2646 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2647 }
2648};
2649} // end anonymous namespace
2650
2651/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2652/// Plan.
2654 VPDominatorTree VPDT(Plan);
2656
2658 Plan.getEntry());
2660 for (VPRecipeBase &R : *VPBB) {
2661 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2662 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2663 continue;
2664 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2665 // V must dominate Def for a valid replacement.
2666 if (!VPDT.dominates(V->getParent(), VPBB))
2667 continue;
2668 // Only keep flags present on both V and Def.
2669 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2670 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2671 Def->replaceAllUsesWith(V);
2672 continue;
2673 }
// First occurrence of this expression: record Def as the canonical recipe
// later duplicates will be replaced with.
2674 CSEMap[Def] = Def;
2675 }
2676 }
2677}
2678
2679/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2680static void licm(VPlan &Plan) {
2681 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2682
2683 // Hoist any loop invariant recipes from the vector loop region to the
2684 // preheader. Preform a shallow traversal of the vector loop region, to
2685 // exclude recipes in replicate regions. Since the top-level blocks in the
2686 // vector loop region are guaranteed to execute if the vector pre-header is,
2687 // we don't need to check speculation safety.
2688 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2689 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2690 "Expected vector prehader's successor to be the vector loop region");
2692 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2693 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2695 continue;
// Hoist only when every operand is defined outside all loop regions, i.e.
// the recipe is loop invariant.
2696 if (any_of(R.operands(), [](VPValue *Op) {
2697 return !Op->isDefinedOutsideLoopRegions();
2698 }))
2699 continue;
2700 R.moveBefore(*Preheader, Preheader->end());
2701 }
2702 }
2703
2704#ifndef NDEBUG
2705 VPDominatorTree VPDT(Plan);
2706#endif
2707 // Sink recipes with no users inside the vector loop region if all users are
2708 // in the same exit block of the region.
2709 // TODO: Extend to sink recipes from inner loops.
2711 LoopRegion->getEntry());
2713 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2714 if (cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2715 continue;
2716
2717 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2718 assert(!RepR->isPredicated() &&
2719 "Expected prior transformation of predicated replicates to "
2720 "replicate regions");
2721 // narrowToSingleScalarRecipes should have already maximally narrowed
2722 // replicates to single-scalar replicates.
2723 // TODO: When unrolling, replicateByVF doesn't handle sunk
2724 // non-single-scalar replicates correctly.
2725 if (!RepR->isSingleScalar())
2726 continue;
2727 }
2728
2729 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2730 // support recipes with multiple defined values (e.g., interleaved loads).
2731 auto *Def = cast<VPSingleDefRecipe>(&R);
2732
2733 // Cannot sink the recipe if the user is defined in a loop region or a
2734 // non-successor of the vector loop region. Cannot sink if user is a phi
2735 // either.
2736 VPBasicBlock *SinkBB = nullptr;
2737 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2738 auto *UserR = cast<VPRecipeBase>(U);
2739 VPBasicBlock *Parent = UserR->getParent();
2740 // TODO: Support sinking when users are in multiple blocks.
2741 if (SinkBB && SinkBB != Parent)
2742 return true;
2743 SinkBB = Parent;
2744 // TODO: If the user is a PHI node, we should check the block of
2745 // incoming value. Support PHI node users if needed.
2746 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2747 Parent->getSinglePredecessor() != LoopRegion;
2748 }))
2749 continue;
2750
// No users at all: sink into the region's single successor so the recipe
// still runs (once) after the loop.
2751 if (!SinkBB)
2752 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2753
2754 // TODO: This will need to be a check instead of a assert after
2755 // conditional branches in vectorized loops are supported.
2756 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2757 "Defining block must dominate sink block");
2758 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2759 // just moving.
2760 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2761 }
2762 }
2763}
2764
2766    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2767  if (Plan.hasScalarVFOnly())
2768    return;
2769  // Keep track of created truncates, so they can be re-used. Note that we
2770  // cannot use RAUW after creating a new truncate, as this could make
2771  // other uses have different types for their operands, making them invalidly
2772  // typed.
2774  VPTypeAnalysis TypeInfo(Plan);
2775  VPBasicBlock *PH = Plan.getVectorPreheader();
2778    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2781        continue;
2782
2783      VPValue *ResultVPV = R.getVPSingleValue();
2784      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      // MinBWs maps each narrowable ingredient to its minimal bit width; a
      // lookup miss (0) means this recipe is not to be narrowed.
2785      unsigned NewResSizeInBits = MinBWs.lookup(UI);
2786      if (!NewResSizeInBits)
2787        continue;
2788
2789      // If the value wasn't vectorized, we must maintain the original scalar
2790      // type. Skip those here, after incrementing NumProcessedRecipes. Also
2791      // skip casts which do not need to be handled explicitly here, as
2792      // redundant casts will be removed during recipe simplification.
2794        continue;
2795
2796      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2797      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2798      assert(OldResTy->isIntegerTy() && "only integer types supported");
2799      (void)OldResSizeInBits;
2800
2801      auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2802
2803      // Any wrapping introduced by shrinking this operation shouldn't be
2804      // considered undefined behavior. So, we can't unconditionally copy
2805      // arithmetic wrapping flags to VPW.
2806      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2807        VPW->dropPoisonGeneratingFlags();
2808
2809      if (OldResSizeInBits != NewResSizeInBits &&
2810          !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2811        // Extend result to original width.
2812        auto *Ext = new VPWidenCastRecipe(
2813            Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2814            VPIRFlags::getDefaultFlags(Instruction::ZExt));
2815        Ext->insertAfter(&R);
        // RAUW first, then re-point the extend at the (narrowed) result so the
        // extend itself does not end up using itself.
2816        ResultVPV->replaceAllUsesWith(Ext);
2817        Ext->setOperand(0, ResultVPV);
2818        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2819      } else {
2820        assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2821               "Only ICmps should not need extending the result.");
2822      }
2823
2824      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2826        continue;
2827
2828      // Shrink operands by introducing truncates as needed.
      // For selects, operand 0 is the condition and keeps its type.
2829      unsigned StartIdx =
2830          match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2831      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2832        auto *Op = R.getOperand(Idx);
2833        unsigned OpSizeInBits =
2835        if (OpSizeInBits == NewResSizeInBits)
2836          continue;
2837        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2838        auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2839        if (!IterIsEmpty) {
2840          R.setOperand(Idx, ProcessedIter->second);
2841          continue;
2842        }
2843
        // VPIRValue operands get their truncate created once in the
        // preheader (so it can be reused); others directly before \p R.
2844        VPBuilder Builder;
2845        if (isa<VPIRValue>(Op))
2846          Builder.setInsertPoint(PH);
2847        else
2848          Builder.setInsertPoint(&R);
2849        VPWidenCastRecipe *NewOp =
2850            Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2851        ProcessedIter->second = NewOp;
2852        R.setOperand(Idx, NewOp);
2853      }
2854
2855    }
2856  }
2857}
2858
/// Fold BranchOnCond terminators with a constant condition, disconnect the
/// never-taken successor and clean up any blocks that become unreachable.
2859void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
  // The dominator tree is only needed to identify latches when OnlyLatches is
  // set.
2860  std::optional<VPDominatorTree> VPDT;
2861  if (OnlyLatches)
2862    VPDT.emplace(Plan);
2863
2864  // Collect all blocks before modifying the CFG so we can identify unreachable
2865  // ones after constant branch removal.
2867
2868  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2869    VPValue *Cond;
2870    // Skip blocks that are not terminated by BranchOnCond.
2871    if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2872      continue;
2873
2874    if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2875      continue;
2876
2877    assert(VPBB->getNumSuccessors() == 2 &&
2878           "Two successors expected for BranchOnCond");
    // Successor 0 is taken on true, successor 1 on false; remove the one that
    // can never be taken. Non-constant conditions are left alone.
2879    unsigned RemovedIdx;
2880    if (match(Cond, m_True()))
2881      RemovedIdx = 1;
2882    else if (match(Cond, m_False()))
2883      RemovedIdx = 0;
2884    else
2885      continue;
2886
2887    VPBasicBlock *RemovedSucc =
2888        cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2889    assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2890           "There must be a single edge between VPBB and its successor");
2891    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2892    // these recipes.
2893    for (VPRecipeBase &R : RemovedSucc->phis())
2894      cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2895
2896    // Disconnect blocks and remove the terminator.
2897    VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2898    VPBB->back().eraseFromParent();
2899  }
2900
2901  // Compute which blocks are still reachable from the entry after constant
2902  // branch removal.
2905
2906  // Detach all unreachable blocks from their successors, removing their recipes
2907  // and incoming values from phi recipes.
  // Tmp serves as a stand-in for defs of erased recipes; any remaining users
  // must themselves live in unreachable blocks and are erased as well.
2908  VPSymbolicValue Tmp;
2909  for (VPBlockBase *B : AllBlocks) {
2910    if (Reachable.contains(B))
2911      continue;
2912    for (VPBlockBase *Succ : to_vector(B->successors())) {
2913      if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2914        for (VPRecipeBase &R : SuccBB->phis())
2915          cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2917    }
2918    for (VPBasicBlock *DeadBB :
2920      for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2921        for (VPValue *Def : R.definedValues())
2922          Def->replaceAllUsesWith(&Tmp);
2923        R.eraseFromParent();
2924      }
2925    }
2926  }
2927}
2928
2950
2951// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2952// the loop terminator with a branch-on-cond recipe with the negated
2953// active-lane-mask as operand. Note that this turns the loop into an
2954// uncountable one. Only the existing terminator is replaced, all other existing
2955// recipes/users remain unchanged, except for poison-generating flags being
2956// dropped from the canonical IV increment. Return the created
2957// VPActiveLaneMaskPHIRecipe.
2958//
2959// The function adds the following recipes:
2960//
2961// vector.ph:
2962// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2963// %EntryALM = active-lane-mask %EntryInc, TC
2964//
2965// vector.body:
2966// ...
2967// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2968// ...
2969// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2970// %ALM = active-lane-mask %InLoopInc, TC
2971// %Negated = Not %ALM
2972// branch-on-cond %Negated
2973//
2976  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2977  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2978  VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2979  auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2980  // TODO: Check if dropping the flags is needed.
2981  TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2982  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2983  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2984  // we have to take unrolling into account. Each part needs to start at
2985  // Part * VF
2986  auto *VecPreheader = Plan.getVectorPreheader();
2987  VPBuilder Builder(VecPreheader);
2988
2989  // Create the ActiveLaneMask instruction using the correct start values.
2990  VPValue *TC = Plan.getTripCount();
2991  VPValue *VF = &Plan.getVF();
2992
2993  auto *EntryIncrement = Builder.createOverflowingOp(
2994      VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2995      DL, "index.part.next");
2996
2997  // Create the active lane mask instruction in the VPlan preheader.
2998  VPValue *ALMMultiplier =
2999      Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
3000  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3001                                        {EntryIncrement, TC, ALMMultiplier}, DL,
3002                                        "active.lane.mask.entry");
3003
3004  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
3005  // preheader ActiveLaneMask instruction.
3006  auto *LaneMaskPhi =
3008  auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
3009  LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
3010
3011  // Create the active lane mask for the next iteration of the loop before the
3012  // original terminator.
3013  VPRecipeBase *OriginalTerminator = EB->getTerminator();
3014  Builder.setInsertPoint(OriginalTerminator);
3015  auto *InLoopIncrement = Builder.createOverflowingOp(
3017      {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3018  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3019                                   {InLoopIncrement, TC, ALMMultiplier}, DL,
3020                                   "active.lane.mask.next");
  // Complete the phi by attaching its backedge value.
3021  LaneMaskPhi->addOperand(ALM);
3022
3023  // Replace the original terminator with BranchOnCond. We have to invert the
3024  // mask here because a true condition means jumping to the exit block.
3025  auto *NotMask = Builder.createNot(ALM, DL);
3026  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3027  OriginalTerminator->eraseFromParent();
3028  return LaneMaskPhi;
3029}
3030
3032                                        bool UseActiveLaneMaskForControlFlow) {
3033  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3034  auto *FoundWidenCanonicalIVUser = find_if(
3036  assert(FoundWidenCanonicalIVUser &&
3037         "Must have widened canonical IV when tail folding!");
3038  VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3039  auto *WideCanonicalIV =
3040      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
3041  VPSingleDefRecipe *LaneMask;
3042  if (UseActiveLaneMaskForControlFlow) {
    // Also rewrite the loop exit branch to test the (negated) lane mask.
3043    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3044  } else {
    // Only introduce a plain active-lane-mask computation right after the
    // widened canonical IV; the exit branch is left untouched.
3045    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3046    VPValue *ALMMultiplier =
3047        Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3048    LaneMask =
3049        B.createNaryOp(VPInstruction::ActiveLaneMask,
3050                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3051                       nullptr, "active.lane.mask");
3052  }
3053
3054  // Walk users of WideCanonicalIV and replace the header mask of the form
3055  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3056  // removing the old one to ensure there is always only a single header mask.
3057  HeaderMask->replaceAllUsesWith(LaneMask);
3058  HeaderMask->eraseFromParent();
3059}
3060
/// Pattern-match helper that recognizes a known mask \p In, either on its own
/// or as one operand of a logical-and; see m_RemoveMask for the public entry.
3061template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  // The mask to be stripped from the matched value.
3062  Op0_t In;
3064
3065  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3066
  // Matches \p V against In itself (Out becomes nullptr) or against
  // (logical-and In, X), binding Out to the remaining operand X.
3067  template <typename OpTy> bool match(OpTy *V) const {
3068    if (m_Specific(In).match(V)) {
3069      Out = nullptr;
3070      return true;
3071    }
3072    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3073  }
3074};
3075
3076/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3077/// Returns the remaining part \p Out if so, or nullptr otherwise.
3078template <typename Op0_t, typename Op1_t>
3079static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3080 Op1_t &Out) {
3081 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3082}
3083
3084/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3085/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3086/// recipe could be created.
3087/// \p HeaderMask Header Mask.
3088/// \p CurRecipe Recipe to be transformed.
3089/// \p TypeInfo VPlan-based type analysis.
3090/// \p EVL The explicit vector length parameter of vector-predication
3091/// intrinsics.
3093                                       VPRecipeBase &CurRecipe,
3094                                       VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3095  VPlan *Plan = CurRecipe.getParent()->getPlan();
3096  DebugLoc DL = CurRecipe.getDebugLoc();
3097  VPValue *Addr, *Mask, *EndPtr;
3098
3099  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3100  auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3101    auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3102    EVLEndPtr->insertBefore(&CurRecipe);
3103    EVLEndPtr->setOperand(1, &EVL);
3104    return EVLEndPtr;
3105  };
3106
  // Wrap \p V in an experimental.vp.reverse predicated by EVL; passes nullptr
  // through unchanged so unmasked values stay unmasked.
3107  auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
3109    if (!V)
3110      return nullptr;
3111    auto *Reverse = new VPWidenIntrinsicRecipe(
3112        Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3113        TypeInfo.inferScalarType(V), {}, {}, DL);
3114    Reverse->insertBefore(&CurRecipe);
3115    return Reverse;
3116  };
3117
  // Masked load guarded (possibly only) by the header mask.
3118  if (match(&CurRecipe,
3119            m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3120    return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3121                                    EVL, Mask);
3122
  // Reversed masked load: reverse the mask, adjust the end pointer to cover
  // EVL lanes, and re-reverse the EVL load's result.
3123  VPValue *ReversedVal;
3124  if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3125      match(ReversedVal,
3126            m_MaskedLoad(m_VPValue(EndPtr),
3127                         m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3128      match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3129    Mask = GetVPReverse(Mask);
3130    Addr = AdjustEndPtr(EndPtr);
3131    auto *LoadR = new VPWidenLoadEVLRecipe(
3132        *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
3133    LoadR->insertBefore(&CurRecipe);
3134    return new VPWidenIntrinsicRecipe(
3135        Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3136        TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3137  }
3138
  // Masked store guarded (possibly only) by the header mask.
3139  VPValue *StoredVal;
3140  if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3141                                      m_RemoveMask(HeaderMask, Mask))))
3142    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3143                                     StoredVal, EVL, Mask);
3144
  // Reversed masked store: reverse mask and stored value, adjust end pointer.
3145  if (match(&CurRecipe,
3146            m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3147                          m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3148      match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3149    Mask = GetVPReverse(Mask);
3150    Addr = AdjustEndPtr(EndPtr);
3151    StoredVal = GetVPReverse(ReversedVal);
3152    return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3153                                     StoredVal, EVL, Mask);
3154  }
3155
3156  if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3157    if (Rdx->isConditional() &&
3158        match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3159      return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3160
3161  if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3162    if (Interleave->getMask() &&
3163        match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3164      return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3165
3166  VPValue *LHS, *RHS;
3167  if (match(&CurRecipe,
3168            m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3169    return new VPWidenIntrinsicRecipe(
3170        Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3171        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3172
3173  if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3174                                 m_VPValue(RHS))))
3175    return new VPWidenIntrinsicRecipe(
3176        Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3177        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3178
  // With EVL tail folding, the last active lane is simply EVL - 1.
3179  if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3180    Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3181    VPValue *ZExt = VPBuilder(&CurRecipe)
3183                        &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3184    return new VPInstruction(
3185        Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3186        VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3187  }
3188
3189  // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3190  if (match(&CurRecipe,
3192                 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3193    return new VPWidenIntrinsicRecipe(
3194        Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
3195        TypeInfo.inferScalarType(LHS), {}, {}, DL);
3196
3197  return nullptr;
3198}
3199
3200/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3201/// The transforms here need to preserve the original semantics.
3203  // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3204  VPValue *HeaderMask = nullptr, *EVL = nullptr;
3207                    m_VPValue(EVL))) &&
3208        match(EVL, m_EVL(m_VPValue()))) {
3209      HeaderMask = R.getVPSingleValue();
3210      break;
3211    }
3212  }
3213  if (!HeaderMask)
3214    return;
3215
  // First pass: rewrite each masked recipe using the header mask into its
  // EVL-based equivalent, re-wiring the defined values.
3216  VPTypeAnalysis TypeInfo(Plan);
3217  SmallVector<VPRecipeBase *> OldRecipes;
3218  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3220    if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3221      NewR->insertBefore(R);
3222      for (auto [Old, New] :
3223           zip_equal(R->definedValues(), NewR->definedValues()))
3224        Old->replaceAllUsesWith(New);
3225      OldRecipes.push_back(R);
3226    }
3227  }
3228
3229  // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3230  // False, EVL)
3231  for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3232    VPValue *Mask;
3233    if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3234      auto *LogicalAnd = cast<VPInstruction>(U);
3235      auto *Merge = new VPWidenIntrinsicRecipe(
3236          Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3237          TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3238      Merge->insertBefore(LogicalAnd);
3239      LogicalAnd->replaceAllUsesWith(Merge);
3240      OldRecipes.push_back(LogicalAnd);
3241    }
3242  }
3243
3244  // Erase old recipes at the end so we don't invalidate TypeInfo.
3245  for (VPRecipeBase *R : reverse(OldRecipes)) {
3246    SmallVector<VPValue *> PossiblyDead(R->operands());
3247    R->eraseFromParent();
3248    for (VPValue *Op : PossiblyDead)
3250  }
3251}
3252
3253/// After replacing the canonical IV with an EVL-based IV, fixup recipes that
3254/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
3255/// iteration.
3256static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3257  VPTypeAnalysis TypeInfo(Plan);
3258  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3259  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3260
3261  assert(all_of(Plan.getVF().users(),
3264         "User of VF that we can't transform to EVL.");
3265  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3267  });
3268
3269  assert(all_of(Plan.getVFxUF().users(),
3271                         m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3272                                 m_Specific(&Plan.getVFxUF())),
3274         "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3275         "increment of the canonical induction.");
3276  Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3277    // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3278    // canonical induction must not be updated.
3280  });
3281
3282  // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3283  // contained.
3284  bool ContainsFORs =
3286  if (ContainsFORs) {
3287    // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3288    VPValue *MaxEVL = &Plan.getVF();
3289    // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3290    VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3291    MaxEVL = Builder.createScalarZExtOrTrunc(
3292        MaxEVL, Type::getInt32Ty(Plan.getContext()),
3293        TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3294
3295    Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3296    VPValue *PrevEVL = Builder.createScalarPhi(
3297        {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3298
    // Replace first-order-recurrence splices with @llvm.experimental.vp.splice
    // parameterized by the previous and current EVL.
3301      for (VPRecipeBase &R : *VPBB) {
3302        VPValue *V1, *V2;
3303        if (!match(&R,
3305                   m_VPValue(V1), m_VPValue(V2))))
3306          continue;
3307        VPValue *Imm = Plan.getOrAddLiveIn(
3310            Intrinsic::experimental_vp_splice,
3311            {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3312            TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3313            R.getDebugLoc());
3314        VPSplice->insertBefore(&R);
3315        R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3316      }
3317    }
3318  }
3319
3320  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3321  if (!HeaderMask)
3322    return;
3323
3324  // Ensure that any reduction that uses a select to mask off tail lanes does so
3325  // in the vector loop, not the middle block, since EVL tail folding can have
3326  // tail elements in the penultimate iteration.
3327  assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3328    if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3329                                                    m_VPValue(), m_VPValue()))))
3330      return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3331             Plan.getVectorLoopRegion();
3332    return true;
3333  }));
3334
3335  // Replace header masks with a mask equivalent to predicating by EVL:
3336  //
3337  // icmp ule widen-canonical-iv backedge-taken-count
3338  // ->
3339  // icmp ult step-vector, EVL
3340  VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3341  VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3342  Type *EVLType = TypeInfo.inferScalarType(&EVL);
3343  VPValue *EVLMask = Builder.createICmp(
3345      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3346  HeaderMask->replaceAllUsesWith(EVLMask);
3347}
3348
3349/// Converts a tail folded vector loop region to step by
3350/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3351/// iteration.
3352///
3353/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3354///   replaces all uses of the canonical IV except for the canonical IV
3355///   increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3356///   only for loop iterations counting after this transformation.
3357///
3358/// - The header mask is replaced with a header mask based on the EVL.
3359///
3360/// - Plans with FORs have a new phi added to keep track of the EVL of the
3361///   previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3362///   @llvm.vp.splice.
3363///
3364/// The function uses the following definitions:
3365///  %StartV is the canonical induction start value.
3366///
3367/// The function adds the following recipes:
3368///
3369/// vector.ph:
3370/// ...
3371///
3372/// vector.body:
3373/// ...
3374/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3375///                                      [ %NextIter, %vector.body ]
3376/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3377/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3378/// ...
3379/// %OpEVL = cast i32 %VPEVL to IVSize
3380/// %NextIter = add IVSize %OpEVL, %CurrentIter
3381/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3382/// ...
3383///
3384/// If MaxSafeElements is provided, the function adds the following recipes:
3385/// vector.ph:
3386/// ...
3387///
3388/// vector.body:
3389/// ...
3390/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3391///                                      [ %NextIter, %vector.body ]
3392/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3393/// %cmp = cmp ult %AVL, MaxSafeElements
3394/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3395/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3396/// ...
3397/// %OpEVL = cast i32 %VPEVL to IVSize
3398/// %NextIter = add IVSize %OpEVL, %CurrentIter
3399/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3400/// ...
3401///
3403    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3404  if (Plan.hasScalarVFOnly())
3405    return;
3406  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3407  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3408
3409  auto *CanonicalIV = LoopRegion->getCanonicalIV();
3410  auto *CanIVTy = LoopRegion->getCanonicalIVType();
3411  VPValue *StartV = Plan.getZero(CanIVTy);
3412  auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3413
3414  // Create the CurrentIteration recipe in the vector loop.
3415  auto *CurrentIteration =
3417  CurrentIteration->insertBefore(*Header, Header->begin());
3418  VPBuilder Builder(Header, Header->getFirstNonPhi());
3419  // Create the AVL (application vector length), starting from TC -> 0 in steps
3420  // of EVL.
3421  VPPhi *AVLPhi = Builder.createScalarPhi(
3422      {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3423  VPValue *AVL = AVLPhi;
3424
3425  if (MaxSafeElements) {
3426    // Support for MaxSafeDist for correct loop emission.
3427    VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3428    VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3429    AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3430                               "safe_avl");
3431  }
3432  auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3433                                     DebugLoc::getUnknown(), "evl");
3434
3435  Builder.setInsertPoint(CanonicalIVIncrement);
3436  VPValue *OpVPEVL = VPEVL;
3437
  // EVL is produced as i32; widen/narrow it to the canonical IV type before
  // using it in the IV and AVL updates.
3438  auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3439  OpVPEVL = Builder.createScalarZExtOrTrunc(
3440      OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3441
3442  auto *NextIter = Builder.createAdd(
3443      OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3444      "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3445  CurrentIteration->addOperand(NextIter);
3446
  // AVL counts down from the trip count and never wraps below zero (NUW).
3447  VPValue *NextAVL =
3448      Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3449                        "avl.next", {/*NUW=*/true, /*NSW=*/false});
3450  AVLPhi->addOperand(NextAVL);
3451
3452  fixupVFUsersForEVL(Plan, *VPEVL);
3453  removeDeadRecipes(Plan);
3454
3455  // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3456  // except for the canonical IV increment.
3457  CanonicalIV->replaceAllUsesWith(CurrentIteration);
3458  CanonicalIVIncrement->setOperand(0, CanonicalIV);
3459  // TODO: support unroll factor > 1.
3460  Plan.setUF(1);
3461}
3462
  // Lower the abstract VPCurrentIterationPHIRecipe (created for EVL-based
  // stepping) into a concrete scalar phi, and fold the redundant canonical IV
  // increment into the current-iteration increment.
3464  // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3465  // There should be only one VPCurrentIteration in the entire plan.
3466  VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3467
3470    for (VPRecipeBase &R : VPBB->phis())
3471      if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3472        assert(!CurrentIteration &&
3473               "Found multiple CurrentIteration. Only one expected");
3474        CurrentIteration = PhiR;
3475      }
3476
3477  // Early return if it is not variable-length stepping.
3478  if (!CurrentIteration)
3479    return;
3480
3481  VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3482  VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3483
3484  // Convert CurrentIteration to concrete recipe.
3485  auto *ScalarR =
3486      VPBuilder(CurrentIteration)
3488              {CurrentIteration->getStartValue(), CurrentIterationIncr},
3489              CurrentIteration->getDebugLoc(), "current.iteration.iv");
3490  CurrentIteration->replaceAllUsesWith(ScalarR)
3491  CurrentIteration->eraseFromParent();
3492
3493  // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3494  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3495  if (auto *CanIVInc = vputils::findUserOf(
3496          CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3497    cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3498    CanIVInc->eraseFromParent();
3499  }
3500}
3501
  // Canonicalize the latch exit of an EVL tail-folded loop: instead of
  // comparing the canonical IV against the vector trip count, exit when the
  // remaining AVL reaches zero.
3503  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3504  if (!LoopRegion)
3505    return;
3506  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3507  if (Header->empty())
3508    return;
3509  // The EVL IV is always at the beginning.
3510  auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3511  if (!EVLPhi)
3512    return;
3513
3514  // Bail if not an EVL tail folded loop.
3515  VPValue *AVL;
3516  if (!match(EVLPhi->getBackedgeValue(),
3517             m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3518    return;
3519
3520  // The AVL may be capped to a safe distance.
3521  VPValue *SafeAVL, *UnsafeAVL;
3522  if (match(AVL,
3524                     m_VPValue(SafeAVL)),
3525            m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3526    AVL = UnsafeAVL;
3527
3528  VPValue *AVLNext;
3529  [[maybe_unused]] bool FoundAVLNext =
3531              m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3532  assert(FoundAVLNext && "Didn't find AVL backedge?");
3533
  // Nothing to rewrite if the latch branch has already been folded to a
  // constant condition.
3534  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3535  auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3536  if (match(LatchBr, m_BranchOnCond(m_True())))
3537    return;
3538
3539  VPValue *CanIVInc;
3540  [[maybe_unused]] bool FoundIncrement = match(
3541      LatchBr,
3543                             m_Specific(&Plan.getVectorTripCount()))));
3544  assert(FoundIncrement &&
3545         match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3546                               m_Specific(&Plan.getVFxUF()))) &&
3547         "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3548         "trip count");
3549
3550  Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3551  VPBuilder Builder(LatchBr);
3552  LatchBr->setOperand(
3553      0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3554}
3555
3557    VPlan &Plan, PredicatedScalarEvolution &PSE,
3558    const DenseMap<Value *, const SCEV *> &StridesMap) {
3559  // Replace VPValues for known constant strides guaranteed by predicate scalar
3560  // evolution.
  // Versioned strides are only valid inside the vectorized loop (and its
  // immediate predecessor block) where the SCEV predicates are guaranteed.
3561  auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3562    auto *R = cast<VPRecipeBase>(&U);
3563    return R->getRegion() ||
3564           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3565  };
3566  ValueToSCEVMapTy RewriteMap;
3567  for (const SCEV *Stride : StridesMap.values()) {
3568    using namespace SCEVPatternMatch;
3569    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3570    const APInt *StrideConst;
3571    if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3572      // Only handle constant strides for now.
3573      continue;
3574
3575    auto *CI = Plan.getConstantInt(*StrideConst);
3576    if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3577      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3578
3579    // The versioned value may not be used in the loop directly but through a
3580    // sext/zext. Add new live-ins in those cases.
3581    for (Value *U : StrideV->users()) {
3583        continue;
3584      VPValue *StrideVPV = Plan.getLiveIn(U);
3585      if (!StrideVPV)
3586        continue;
      // Extend the constant stride with the same signedness as the IR cast.
3587      unsigned BW = U->getType()->getScalarSizeInBits();
3588      APInt C =
3589          isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3590      VPValue *CI = Plan.getConstantInt(C);
3591      StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3592    }
3593    RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3594  }
3595
  // Also rewrite SCEV expansions in the entry block so expanded expressions
  // use the versioned strides.
3596  for (VPRecipeBase &R : *Plan.getEntry()) {
3597    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3598    if (!ExpSCEV)
3599      continue;
3600    const SCEV *ScevExpr = ExpSCEV->getSCEV();
3601    auto *NewSCEV =
3602        SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3603    if (NewSCEV != ScevExpr) {
3604      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3605      ExpSCEV->replaceAllUsesWith(NewExp);
3606      if (Plan.getTripCount() == ExpSCEV)
3607        Plan.resetTripCount(NewExp);
3608    }
3609  }
3610}
3611
// Walk the backward slice of each consecutive widened-memory / interleave
// address and drop poison-generating flags from contributing recipes, so
// speculated address computation under predication cannot introduce poison.
// (NOTE(review): the function header line was lost in extraction; this is
// presumably VPlanTransforms::dropPoisonGeneratingRecipes.)
3613 VPlan &Plan,
3614 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3615 // Collect recipes in the backward slice of `Root` that may generate a poison
3616 // value that is used after vectorization.
// NOTE(review): declarations of the Visited set and Worklist (original lines
// 3617/3619) were dropped by the extraction — confirm upstream.
3618 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3620 Worklist.push_back(Root);
3621
3622 // Traverse the backward slice of Root through its use-def chain.
3623 while (!Worklist.empty()) {
3624 VPRecipeBase *CurRec = Worklist.pop_back_val();
3625
3626 if (!Visited.insert(CurRec).second)
3627 continue;
3628
3629 // Prune search if we find another recipe generating a widen memory
3630 // instruction. Widen memory instructions involved in address computation
3631 // will lead to gather/scatter instructions, which don't need to be
3632 // handled.
// NOTE(review): the start of this isa<...> check (original line 3633) was
// dropped by the extraction.
3634 VPHeaderPHIRecipe>(CurRec))
3635 continue;
3636
3637 // This recipe contributes to the address computation of a widen
3638 // load/store. If the underlying instruction has poison-generating flags,
3639 // drop them directly.
3640 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3641 VPValue *A, *B;
3642 // Dropping disjoint from an OR may yield incorrect results, as some
3643 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3644 // for dependence analysis). Instead, replace it with an equivalent Add.
3645 // This is possible as all users of the disjoint OR only access lanes
3646 // where the operands are disjoint or poison otherwise.
3647 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3648 RecWithFlags->isDisjoint()) {
3649 VPBuilder Builder(RecWithFlags);
3650 VPInstruction *New =
3651 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3652 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3653 RecWithFlags->replaceAllUsesWith(New);
3654 RecWithFlags->eraseFromParent();
// Continue the slice walk from the replacement Add's operands.
3655 CurRec = New;
3656 } else
3657 RecWithFlags->dropPoisonGeneratingFlags();
3658 } else {
// NOTE(review): the retrieval of the underlying Instruction (original lines
// 3659-3660) was dropped by the extraction.
3661 (void)Instr;
3662 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3663 "found instruction with poison generating flags not covered by "
3664 "VPRecipeWithIRFlags");
3665 }
3666
3667 // Add new definitions to the worklist.
3668 for (VPValue *Operand : CurRec->operands())
3669 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3670 Worklist.push_back(OpDef);
3671 }
3672 });
3673
3674 // Traverse all the recipes in the VPlan and collect the poison-generating
3675 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3676 // VPInterleaveRecipe.
3677 auto Iter =
// NOTE(review): the traversal setup and block loop header (original lines
// 3678-3679) were dropped by the extraction.
3680 for (VPRecipeBase &Recipe : *VPBB) {
3681 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3682 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3683 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
// Only consecutive (non-gather/scatter) accesses in blocks that need
// predication require the slice walk.
3684 if (AddrDef && WidenRec->isConsecutive() &&
3685 BlockNeedsPredication(UnderlyingInstr.getParent()))
3686 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3687 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3688 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3689 if (AddrDef) {
3690 // Check if any member of the interleave group needs predication.
3691 const InterleaveGroup<Instruction> *InterGroup =
3692 InterleaveRec->getInterleaveGroup();
3693 bool NeedPredication = false;
3694 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3695 I < NumMembers; ++I) {
3696 Instruction *Member = InterGroup->getMember(I);
3697 if (Member)
3698 NeedPredication |= BlockNeedsPredication(Member->getParent());
3699 }
3700
3701 if (NeedPredication)
3702 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3703 }
3704 }
3705 }
3706 }
3707}
3708
// Replace the per-member widened memory recipes of each interleave group with
// a single VPInterleaveRecipe at the group's insert position, rewiring loads'
// users to the interleave recipe's results. (NOTE(review): the function header
// lines were lost in extraction; presumably
// VPlanTransforms::createInterleaveGroups.)
3710 VPlan &Plan,
3712 &InterleaveGroups,
3713 VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed) {
3714 if (InterleaveGroups.empty())
3715 return;
3716
3717 // Interleave memory: for each Interleave Group we marked earlier as relevant
3718 // for this VPlan, replace the Recipes widening its memory instructions with a
3719 // single VPInterleaveRecipe at its insertion point.
3720 VPDominatorTree VPDT(Plan);
3721 for (const auto *IG : InterleaveGroups) {
3722 auto *Start =
3723 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
// Metadata attached to the combined recipe must hold for every member, so
// intersect across all members below.
3724 VPIRMetadata InterleaveMD(*Start);
3725 SmallVector<VPValue *, 4> StoredValues;
3726 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3727 StoredValues.push_back(StoreR->getStoredValue());
3728 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3729 Instruction *MemberI = IG->getMember(I);
3730 if (!MemberI)
3731 continue;
3732 VPWidenMemoryRecipe *MemoryR =
3733 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3734 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3735 StoredValues.push_back(StoreR->getStoredValue());
3736 InterleaveMD.intersect(*MemoryR);
3737 }
3738
// Gaps must be masked when a scalar epilogue is required but not allowed, or
// when storing to a group with missing members.
3739 bool NeedsMaskForGaps =
3740 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3741 (!StoredValues.empty() && !IG->isFull());
3742
3743 Instruction *IRInsertPos = IG->getInsertPos();
3744 auto *InsertPos =
3745 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3746
// NOTE(review): the declaration/initialization of the GEP no-wrap flags NW
// (original line 3747) was dropped by the extraction — confirm upstream.
3748 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3749 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3750 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3751
3752 // Get or create the start address for the interleave group.
3753 VPValue *Addr = Start->getAddr();
3754 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3755 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3756 // We cannot re-use the address of member zero because it does not
3757 // dominate the insert position. Instead, use the address of the insert
3758 // position and create a PtrAdd adjusting it to the address of member
3759 // zero.
3760 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3761 // InsertPos or sink loads above zero members to join it.
3762 assert(IG->getIndex(IRInsertPos) != 0 &&
3763 "index of insert position shouldn't be zero");
3764 auto &DL = IRInsertPos->getDataLayout();
// Negative byte offset from the insert position's member back to member 0.
3765 APInt Offset(32,
3766 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3767 IG->getIndex(IRInsertPos),
3768 /*IsSigned=*/true);
3769 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3770 VPBuilder B(InsertPos);
3771 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3772 }
3773 // If the group is reverse, adjust the index to refer to the last vector
3774 // lane instead of the first. We adjust the index from the first vector
3775 // lane, rather than directly getting the pointer for lane VF - 1, because
3776 // the pointer operand of the interleaved access is supposed to be uniform.
3777 if (IG->isReverse()) {
3778 auto *ReversePtr = new VPVectorEndPointerRecipe(
3779 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3780 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3781 ReversePtr->insertBefore(InsertPos);
3782 Addr = ReversePtr;
3783 }
3784 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3785 InsertPos->getMask(), NeedsMaskForGaps,
3786 InterleaveMD, InsertPos->getDebugLoc());
3787 VPIG->insertBefore(InsertPos);
3788
// Rewire each non-void member's uses to the corresponding interleave result
// (J counts only value-producing members), then erase the member recipes.
3789 unsigned J = 0;
3790 for (unsigned i = 0; i < IG->getFactor(); ++i)
3791 if (Instruction *Member = IG->getMember(i)) {
3792 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3793 if (!Member->getType()->isVoidTy()) {
3794 VPValue *OriginalV = MemberR->getVPSingleValue();
3795 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3796 J++;
3797 }
3798 MemberR->eraseFromParent();
3799 }
3800 }
3801}
3802
3803/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3804/// value, phi and backedge value. In the following example:
3805///
3806/// vector.ph:
3807/// Successor(s): vector loop
3808///
3809/// <x1> vector loop: {
3810/// vector.body:
3811/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3812/// ...
3813/// EMIT branch-on-count ...
3814/// No successors
3815/// }
3816///
3817/// WIDEN-INDUCTION will get expanded to:
3818///
3819/// vector.ph:
3820/// ...
3821/// vp<%induction.start> = ...
3822/// vp<%induction.increment> = ...
3823///
3824/// Successor(s): vector loop
3825///
3826/// <x1> vector loop: {
3827/// vector.body:
3828/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3829/// ...
3830/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3831/// EMIT branch-on-count ...
3832/// No successors
3833/// }
3834 static void
// NOTE(review): the name/first-parameter line (original line 3835, presumably
// expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,) was
// dropped by the extraction. See the doc comment above for the expansion.
3836 VPTypeAnalysis &TypeInfo) {
3837 VPlan *Plan = WidenIVR->getParent()->getPlan();
3838 VPValue *Start = WidenIVR->getStartValue();
3839 VPValue *Step = WidenIVR->getStepValue();
3840 VPValue *VF = WidenIVR->getVFValue();
3841 DebugLoc DL = WidenIVR->getDebugLoc();
3842
3843 // The value from the original loop to which we are mapping the new induction
3844 // variable.
3845 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3846
// Choose integer vs floating-point add/mul opcodes based on the induction
// kind. (NOTE(review): the AddOp/MulOp declarations on original lines
// 3848-3849 were dropped by the extraction.)
3847 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3850 VPIRFlags Flags = *WidenIVR;
3851 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3852 AddOp = Instruction::Add;
3853 MulOp = Instruction::Mul;
3854 } else {
3855 AddOp = ID.getInductionOpcode();
3856 MulOp = Instruction::FMul;
3857 }
3858
3859 // If the phi is truncated, truncate the start and step values.
3860 VPBuilder Builder(Plan->getVectorPreheader());
3861 Type *StepTy = TypeInfo.inferScalarType(Step);
3862 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3863 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3864 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3865 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3866 StepTy = Ty;
3867 }
3868
3869 // Construct the initial value of the vector IV in the vector loop preheader.
// StepVector is produced in an integer type and converted to FP if the
// induction is floating-point. (NOTE(review): the IVIntTy initializer on
// original line 3871 was dropped by the extraction.)
3870 Type *IVIntTy =
3872 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3873 if (StepTy->isFloatingPointTy())
3874 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3875
3876 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3877 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3878
// Init = Start + <0, 1, ..., VF-1> * Step, per-lane.
3879 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3880 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3881 DebugLoc::getUnknown(), "induction");
3882
3883 // Create the widened phi of the vector IV.
3884 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3885 Init, WidenIVR->getDebugLoc(), "vec.ind");
3886
3887 // Create the backedge value for the vector IV.
3888 VPValue *Inc;
3889 VPValue *Prev;
3890 // If unrolled, use the increment and prev value from the operands.
3891 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3892 Inc = SplatVF;
3893 Prev = WidenIVR->getLastUnrolledPartOperand();
3894 } else {
// Place the VF * Step computation right after VF's definition so it
// dominates all uses.
3895 if (VPRecipeBase *R = VF->getDefiningRecipe())
3896 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3897 // Multiply the vectorization factor by the step using integer or
3898 // floating-point arithmetic as appropriate.
3899 if (StepTy->isFloatingPointTy())
3900 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3901 DL);
3902 else
3903 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3904 TypeInfo.inferScalarType(VF), DL);
3905
3906 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3907 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3908 Prev = WidePHI;
3909 }
3910
// Emit the backedge add just before the exiting block's terminator.
// (NOTE(review): the ExitingBB definition on original line 3911 was dropped
// by the extraction.)
3912 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3913 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3914 WidenIVR->getDebugLoc(), "vec.ind.next");
3915
3916 WidePHI->addOperand(Next);
3917
3918 WidenIVR->replaceAllUsesWith(WidePHI);
3919}
3920
3921/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3922/// initial value, phi and backedge value. In the following example:
3923///
3924/// <x1> vector loop: {
3925/// vector.body:
3926/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3927/// ...
3928/// EMIT branch-on-count ...
3929/// }
3930///
3931/// WIDEN-POINTER-INDUCTION will get expanded to:
3932///
3933/// <x1> vector loop: {
3934/// vector.body:
3935/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3936/// EMIT %mul = mul %stepvector, %step
3937/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3938/// ...
3939/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3940/// EMIT branch-on-count ...
3941/// }
// NOTE(review): the header line (original line 3942, presumably
// static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,)
// was dropped by the extraction. See the doc comment above for the expansion.
3943 VPTypeAnalysis &TypeInfo) {
3944 VPlan *Plan = R->getParent()->getPlan();
3945 VPValue *Start = R->getStartValue();
3946 VPValue *Step = R->getStepValue();
3947 VPValue *VF = R->getVFValue();
3948
// NOTE(review): the expected-kind constant (original line 3950, presumably
// InductionDescriptor::IK_PtrInduction &&) was dropped by the extraction.
3949 assert(R->getInductionDescriptor().getKind() ==
3951 "Not a pointer induction according to InductionDescriptor!");
3952 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3953 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3954 "Recipe should have been replaced");
3955
3956 VPBuilder Builder(R);
3957 DebugLoc DL = R->getDebugLoc();
3958
3959 // Build a scalar pointer phi.
3960 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3961
3962 // Create actual address geps that use the pointer phi as base and a
3963 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3964 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3965 Type *StepTy = TypeInfo.inferScalarType(Step);
3966 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3967 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3968 VPValue *PtrAdd =
3969 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3970 R->replaceAllUsesWith(PtrAdd);
3971
3972 // Create the backedge value for the scalar pointer phi.
// NOTE(review): the ExitingBB definition (original line 3973) was dropped by
// the extraction; the increment is placed before the exiting terminator.
3974 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// Scale VF to the step's type so the per-iteration pointer increment is
// VF * Step.
3975 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3976 DL);
3977 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3978
3979 VPValue *InductionGEP =
3980 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3981 ScalarPtrPhi->addOperand(InductionGEP);
3982}
3983
3985 // Replace loop regions with explicit CFG.
// Collect first, then dissolve: dissolving while iterating the depth-first
// traversal would invalidate it. (NOTE(review): the loop header over region
// blocks, original line 3987, was dropped by the extraction.)
3986 SmallVector<VPRegionBlock *> LoopRegions;
3988 vp_depth_first_deep(Plan.getEntry()))) {
// Replicate regions are kept; only loop regions are flattened into plain CFG.
3989 if (!R->isReplicator())
3990 LoopRegions.push_back(R);
3991 }
3992 for (VPRegionBlock *R : LoopRegions)
3993 R->dissolveToCFGLoop();
3994}
3995
// Expand each BranchOnTwoConds terminator (three successors) into two
// single-condition branches via a new interim block. (NOTE(review): the
// function header lines, original lines 3996-3997, were dropped by the
// extraction.)
3998 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3999 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// NOTE(review): the WorkList declaration and block-traversal loop header
// (original lines 4000-4001) were dropped by the extraction.
4002 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4003 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4004 }
4005
4006 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4007 // single-condition branches:
4008 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4009 // the first condition is true, and otherwise jumps to a new interim block.
4010 // 2. A branch that ends the interim block, jumps to the second successor if
4011 // the second condition is true, and otherwise jumps to the third
4012 // successor.
4013 for (VPInstruction *Br : WorkList) {
4014 assert(Br->getNumOperands() == 2 &&
4015 "BranchOnTwoConds must have exactly 2 conditions");
4016 DebugLoc DL = Br->getDebugLoc();
4017 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
4018 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4019 assert(Successors.size() == 3 &&
4020 "BranchOnTwoConds must have exactly 3 successors");
4021
// Detach all successors first; edges are re-established below through the
// interim block.
4022 for (VPBlockBase *Succ : Successors)
4023 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4024
4025 VPValue *Cond0 = Br->getOperand(0);
4026 VPValue *Cond1 = Br->getOperand(1);
4027 VPBlockBase *Succ0 = Successors[0];
4028 VPBlockBase *Succ1 = Successors[1];
4029 VPBlockBase *Succ2 = Successors[2];
4030 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4031 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4032
4033 VPBasicBlock *InterimBB =
4034 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4035
// NOTE(review): the BranchOnCond emission lines (original lines 4037 and
// 4041, creating the Cond0 and Cond1 branches) were dropped by the
// extraction — confirm upstream.
4036 VPBuilder(BrOnTwoCondsBB)
4038 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4039 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4040
4042 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4043 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4044 Br->eraseFromParent();
4045 }
4046}
4047
// Lower abstract recipes to concrete, executable ones: expands widened
// int/FP/pointer inductions, blends, expression recipes, and several
// VPInstruction opcodes (LastActiveLane, MaskedCond, CanonicalIVIncrementForPart,
// BranchOnCount, WideIVStep). (NOTE(review): the function header line was lost
// in extraction; presumably VPlanTransforms::convertToConcreteRecipes.)
4048 VPTypeAnalysis TypeInfo(Plan);
// NOTE(review): the ToRemove declaration and block-traversal loop header
// (original lines 4049-4051) were dropped by the extraction.
4052 vp_depth_first_deep(Plan.getEntry()))) {
4053 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4054 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4055 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4056 ToRemove.push_back(WidenIVR);
4057 continue;
4058 }
4059
4060 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4061 // If the recipe only generates scalars, scalarize it instead of
4062 // expanding it.
4063 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4064 VPBuilder Builder(WidenIVR);
4065 VPValue *PtrAdd =
4066 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4067 WidenIVR->replaceAllUsesWith(PtrAdd);
4068 ToRemove.push_back(WidenIVR);
4069 continue;
4070 }
4071 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4072 ToRemove.push_back(WidenIVR);
4073 continue;
4074 }
4075
4076 // Expand VPBlendRecipe into VPInstruction::Select.
4077 VPBuilder Builder(&R);
4078 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Fold incoming values into a chain of selects; later incomings take
// priority when their mask is true.
4079 VPValue *Select = Blend->getIncomingValue(0);
4080 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4081 Select = Builder.createSelect(Blend->getMask(I),
4082 Blend->getIncomingValue(I), Select,
4083 R.getDebugLoc(), "predphi", *Blend);
4084 Blend->replaceAllUsesWith(Select);
4085 ToRemove.push_back(Blend);
4086 }
4087
4088 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4089 if (!VEPR->getOffset()) {
4090 assert(Plan.getConcreteUF() == 1 &&
4091 "Expected unroller to have materialized offset for UF != 1");
4092 VEPR->materializeOffset();
4093 }
4094 }
4095
4096 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4097 Expr->decompose();
4098 ToRemove.push_back(Expr);
4099 }
4100
4101 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4102 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4103 if (LastActiveL &&
4104 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4105 // Create Not(Mask) for all operands.
// NOTE(review): the NotMasks vector declaration (original line 4106) was
// dropped by the extraction.
4107 for (VPValue *Op : LastActiveL->operands()) {
4108 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4109 NotMasks.push_back(NotMask);
4110 }
4111
4112 // Create FirstActiveLane on the inverted masks.
// NOTE(review): the opcode/operand line of this createNaryOp call (original
// line 4114) was dropped by the extraction.
4113 VPValue *FirstInactiveLane = Builder.createNaryOp(
4115 LastActiveL->getDebugLoc(), "first.inactive.lane");
4116
4117 // Subtract 1 to get the last active lane.
4118 VPValue *One =
4119 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4120 VPValue *LastLane =
4121 Builder.createSub(FirstInactiveLane, One,
4122 LastActiveL->getDebugLoc(), "last.active.lane");
4123
4124 LastActiveL->replaceAllUsesWith(LastLane);
4125 ToRemove.push_back(LastActiveL);
4126 continue;
4127 }
4128
4129 // Lower MaskedCond with block mask to LogicalAnd.
// NOTE(review): the matching if-condition (original line 4130) was dropped
// by the extraction.
4131 auto *VPI = cast<VPInstruction>(&R);
4132 assert(VPI->isMasked() &&
4133 "Unmasked MaskedCond should be simplified earlier");
4134 VPI->replaceAllUsesWith(Builder.createNaryOp(
4135 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4136 ToRemove.push_back(VPI);
4137 continue;
4138 }
4139
4140 // Lower CanonicalIVIncrementForPart to plain Add.
// NOTE(review): the match pattern line (original line 4143) was dropped by
// the extraction.
4141 if (match(
4142 &R,
4144 auto *VPI = cast<VPInstruction>(&R);
4145 VPValue *Add = Builder.createOverflowingOp(
4146 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4147 VPI->getDebugLoc());
4148 VPI->replaceAllUsesWith(Add);
4149 ToRemove.push_back(VPI);
4150 continue;
4151 }
4152
4153 // Lower BranchOnCount to ICmp + BranchOnCond.
4154 VPValue *IV, *TC;
4155 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4156 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4157 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4158 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4159 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4160 ToRemove.push_back(BranchOnCountInst);
4161 continue;
4162 }
4163
// Anything that is not a WideIVStep is left untouched from here on.
// (NOTE(review): the negated match line, original line 4166, was dropped by
// the extraction.)
4164 VPValue *VectorStep;
4165 VPValue *ScalarStep;
4167 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4168 continue;
4169
4170 // Expand WideIVStep.
4171 auto *VPI = cast<VPInstruction>(&R);
4172 Type *IVTy = TypeInfo.inferScalarType(VPI);
// Cast the vector step to the IV type: UIToFP for FP IVs, Trunc otherwise.
// (NOTE(review): the CastOp selection line, original line 4174, was dropped
// by the extraction.)
4173 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4175 ? Instruction::UIToFP
4176 : Instruction::Trunc;
4177 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4178 }
4179
4180 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4181 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4182 ScalarStep =
4183 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4184 }
4185
4186 VPIRFlags Flags;
4187 unsigned MulOpc;
4188 if (IVTy->isFloatingPointTy()) {
4189 MulOpc = Instruction::FMul;
4190 Flags = VPI->getFastMathFlags();
4191 } else {
4192 MulOpc = Instruction::Mul;
4193 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4194 }
4195
4196 VPInstruction *Mul = Builder.createNaryOp(
4197 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4198 VectorStep = Mul;
4199 VPI->replaceAllUsesWith(VectorStep);
4200 ToRemove.push_back(VPI);
4201 }
4202 }
4203
// Erase replaced recipes only after the traversal to keep iteration valid.
4204 for (VPRecipeBase *R : ToRemove)
4205 R->eraseFromParent();
4206}
4207
// Rewrite a vector loop with uncountable early exits: collect each early-exit
// condition, OR them into an AnyOf latch check, and build a dispatch chain of
// vector.early.exit blocks that extract exit values at the first active lane.
// (NOTE(review): the function header line, original line 4208, was dropped by
// the extraction.)
4209 VPBasicBlock *HeaderVPBB,
4210 VPBasicBlock *LatchVPBB,
4211 VPBasicBlock *MiddleVPBB,
4212 UncountableExitStyle Style) {
// Per-exit bookkeeping: the exiting block, its target exit block, and the
// (possibly negated) per-lane condition under which the exit is taken.
4213 struct EarlyExitInfo {
4214 VPBasicBlock *EarlyExitingVPBB;
4215 VPIRBasicBlock *EarlyExitVPBB;
4216 VPValue *CondToExit;
4217 };
4218
4219 VPDominatorTree VPDT(Plan);
4220 VPBuilder Builder(LatchVPBB->getTerminator());
// NOTE(review): the Exits vector declaration (original line 4221) was dropped
// by the extraction.
4222 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4223 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
// The middle block is the countable (latch) exit path; skip it here.
4224 if (Pred == MiddleVPBB)
4225 continue;
4226 // Collect condition for this early exit.
4227 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4228 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4229 VPValue *CondOfEarlyExitingVPBB;
4230 [[maybe_unused]] bool Matched =
4231 match(EarlyExitingVPBB->getTerminator(),
4232 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4233 assert(Matched && "Terminator must be BranchOnCond");
4234
4235 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4236 // the correct block mask.
4237 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
// Normalize the condition so "true" means the exit is taken; negate it if
// the exit block is on the false edge. (NOTE(review): the MaskedCond opcode
// line, original line 4239, was dropped by the extraction.)
4238 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4240 TrueSucc == ExitBlock
4241 ? CondOfEarlyExitingVPBB
4242 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4243 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4244 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4245 VPDT.properlyDominates(
4246 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4247 LatchVPBB)) &&
4248 "exit condition must dominate the latch");
4249 Exits.push_back({
4250 EarlyExitingVPBB,
4251 ExitBlock,
4252 CondToEarlyExit,
4253 });
4254 }
4255 }
4256
4257 assert(!Exits.empty() && "must have at least one early exit");
4258 // Sort exits by RPO order to get correct program order. RPO gives a
4259 // topological ordering of the CFG, ensuring upstream exits are checked
4260 // before downstream exits in the dispatch chain.
// NOTE(review): the RPO traversal setup and RPOIdx map declaration (original
// lines 4261 and 4263) were dropped by the extraction.
4262 HeaderVPBB);
4264 for (const auto &[Num, VPB] : enumerate(RPOT))
4265 RPOIdx[VPB] = Num;
4266 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4267 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4268 });
4269#ifndef NDEBUG
4270 // After RPO sorting, verify that for any pair where one exit dominates
4271 // another, the dominating exit comes first. This is guaranteed by RPO
4272 // (topological order) and is required for the dispatch chain correctness.
4273 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4274 for (unsigned J = I + 1; J < Exits.size(); ++J)
4275 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4276 Exits[I].EarlyExitingVPBB) &&
4277 "RPO sort must place dominating exits before dominated ones");
4278#endif
4279
4280 // Build the AnyOf condition for the latch terminator using logical OR
4281 // to avoid poison propagation from later exit conditions when an earlier
4282 // exit is taken.
4283 VPValue *Combined = Exits[0].CondToExit;
4284 for (const EarlyExitInfo &Info : drop_begin(Exits))
4285 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4286
4287 VPValue *IsAnyExitTaken =
4288 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4289
// NOTE(review): the assert condition on Style (original line 4290) was
// dropped by the extraction.
4291 "Early exit store masking not implemented");
4292
4293 // Create the vector.early.exit blocks.
4294 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4295 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4296 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4297 VPBasicBlock *VectorEarlyExitVPBB =
4298 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4299 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4300 }
4301
4302 // Create the dispatch block (or reuse the single exit block if only one
4303 // exit). The dispatch block computes the first active lane of the combined
4304 // condition and, for multiple exits, chains through conditions to determine
4305 // which exit to take.
4306 VPBasicBlock *DispatchVPBB =
4307 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4308 : Plan.createVPBasicBlock("vector.early.exit.check");
4309 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4310 VPValue *FirstActiveLane =
4311 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4312 DebugLoc::getUnknown(), "first.active.lane");
4313
4314 // For each early exit, disconnect the original exiting block
4315 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4316 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4317 // values at the first active lane:
4318 //
4319 // Input:
4320 // early.exiting.I:
4321 // ...
4322 // EMIT branch-on-cond vp<%cond.I>
4323 // Successor(s): in.loop.succ, ir-bb<exit.I>
4324 //
4325 // ir-bb<exit.I>:
4326 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4327 //
4328 // Output:
4329 // early.exiting.I:
4330 // ...
4331 // Successor(s): in.loop.succ
4332 //
4333 // vector.early.exit.I:
4334 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4335 // Successor(s): ir-bb<exit.I>
4336 //
4337 // ir-bb<exit.I>:
4338 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4339 // vector.early.exit.I)
4340 //
4341 for (auto [Exit, VectorEarlyExitVPBB] :
4342 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4343 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4344 // Adjust the phi nodes in EarlyExitVPBB.
4345 // 1. remove incoming values from EarlyExitingVPBB,
4346 // 2. extract the incoming value at FirstActiveLane
4347 // 3. add back the extracts as last operands for the phis
4348 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4349 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4350 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4351 // values from VectorEarlyExitVPBB.
4352 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4353 auto *ExitIRI = cast<VPIRPhi>(&R);
4354 VPValue *IncomingVal =
4355 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4356 VPValue *NewIncoming = IncomingVal;
// Live-ins (VPIRValues) are uniform and need no per-lane extract.
4357 if (!isa<VPIRValue>(IncomingVal)) {
4358 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4359 NewIncoming = EarlyExitBuilder.createNaryOp(
4360 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4361 DebugLoc::getUnknown(), "early.exit.value");
4362 }
4363 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4364 ExitIRI->addOperand(NewIncoming);
4365 }
4366
4367 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4368 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4369 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4370 }
4371
4372 // Chain through exits: for each exit, check if its condition is true at
4373 // the first active lane. If so, take that exit; otherwise, try the next.
4374 // The last exit needs no check since it must be taken if all others fail.
4375 //
4376 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4377 //
4378 // latch:
4379 // ...
4380 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4381 // ...
4382 //
4383 // vector.early.exit.check:
4384 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4385 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4386 // EMIT branch-on-cond vp<%at.cond.0>
4387 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4388 //
4389 // vector.early.exit.check.0:
4390 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4391 // EMIT branch-on-cond vp<%at.cond.1>
4392 // Successor(s): vector.early.exit.1, vector.early.exit.2
4393 VPBasicBlock *CurrentBB = DispatchVPBB;
4394 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4395 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4396 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4397 DebugLoc::getUnknown(), "exit.cond.at.lane");
4398
4399 // For the last dispatch, branch directly to the last exit on false;
4400 // otherwise, create a new check block.
4401 bool IsLastDispatch = (I + 2 == Exits.size());
4402 VPBasicBlock *FalseBB =
4403 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4404 : Plan.createVPBasicBlock(
4405 Twine("vector.early.exit.check.") + Twine(I));
4406
4407 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4408 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4409 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4410 FalseBB->setPredecessors({CurrentBB});
4411
4412 CurrentBB = FalseBB;
4413 DispatchBuilder.setInsertPoint(CurrentBB);
4414 }
4415
4416 // Replace the latch terminator with the new branching logic.
4417 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4418 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4419 "Unexpected terminator");
4420 auto *IsLatchExitTaken =
4421 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4422 LatchExitingBranch->getOperand(1));
4423
4424 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4425 LatchExitingBranch->eraseFromParent();
4426 Builder.setInsertPoint(LatchVPBB);
// Successor order matches BranchOnTwoConds semantics: early-exit dispatch,
// middle block (countable exit), loop header (continue).
4427 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4428 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4429 LatchVPBB->clearSuccessors();
4430 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4431 DispatchVPBB->setPredecessors({LatchVPBB});
4432}
4433
4434 /// This function tries to convert extended in-loop reductions to
4435/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4436/// valid. The created recipe must be decomposed to its constituent
4437/// recipes before execution.
4438 static VPExpressionRecipe *
// NOTE(review): the name/parameter line (original line 4439, presumably
// tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext
// &Ctx,) was dropped by the extraction. See the doc comment above.
4440 VFRange &Range) {
4441 Type *RedTy = Ctx.Types.inferScalarType(Red);
4442 VPValue *VecOp = Red->getVecOp();
4443
4444 assert(!Red->isPartialReduction() &&
4445 "This path does not support partial reductions");
4446
4447 // Clamp the range if using extended-reduction is profitable.
// Returns true (and clamps Range) for VFs where the target's fused
// extended-reduction is cheaper than the separate extend + reduction.
4448 auto IsExtendedRedValidAndClampRange =
4449 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
// NOTE(review): the LoopVectorizationPlanner range-clamp call line (original
// line 4450) was dropped by the extraction.
4451 [&](ElementCount VF) {
4452 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4454
// NOTE(review): the ExtRedCost declaration and cost-kind line (original
// lines 4453/4455) were dropped by the extraction.
4456 InstructionCost ExtCost =
4457 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4458 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4459
4460 assert(!RedTy->isFloatingPointTy() &&
4461 "getExtendedReductionCost only supports integer types");
4462 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4463 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4464 Red->getFastMathFlags(), CostKind);
4465 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4466 },
4467 Range);
4468 };
4469
4470 VPValue *A;
4471 // Match reduce(ext(A)).
// NOTE(review): the match(...) condition line (original line 4472) was
// dropped by the extraction.
4473 IsExtendedRedValidAndClampRange(
4474 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4475 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4476 Ctx.Types.inferScalarType(A)))
4477 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4478
4479 return nullptr;
4480}
4481
4482 /// This function tries convert extended in-loop reductions to
4483 /// VPExpressionRecipe and clamp the \p Range if it is beneficial
4484 /// and valid. The created VPExpressionRecipe must be decomposed to its
4485 /// constituent recipes before execution. Patterns of the
4486 /// VPExpressionRecipe:
4487 /// reduce.add(mul(...)),
4488 /// reduce.add(mul(ext(A), ext(B))),
4489 /// reduce.add(ext(mul(ext(A), ext(B)))).
4490 /// reduce.fadd(fmul(ext(A), ext(B)))
// NOTE(review): doxygen listing dump — the declarator line (orig. 4492,
// presumably `tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe
// *Red,` per the call site in tryToCreateAbstractReductionRecipe) and several
// hyperlinked lines below (4505, 4507, 4509, 4571) were dropped in extraction.
4491 static VPExpressionRecipe *
4493 VPCostContext &Ctx, VFRange &Range) {
4494 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
// Only add/sub/fadd reductions can be expressed as multiply-accumulate.
4495 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4496 Opcode != Instruction::FAdd)
4497 return nullptr;
4498
4499 assert(!Red->isPartialReduction() &&
4500 "This path does not support partial reductions");
4501 Type *RedTy = Ctx.Types.inferScalarType(Red);
4502
4503 // Clamp the range if using multiply-accumulate-reduction is profitable.
// (orig. lines 4505/4507/4509 — the lambda parameter list head and the
// getDecisionAndClampRange(...) call header — are missing from this dump)
4504 auto IsMulAccValidAndClampRange =
4506 VPWidenCastRecipe *OuterExt) -> bool {
4508 [&](ElementCount VF) {
4510 Type *SrcTy =
4511 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4512 InstructionCost MulAccCost;
4513
4514 // getMulAccReductionCost for in-loop reductions does not support
4515 // mixed or floating-point extends.
4516 if (Ext0 && Ext1 &&
4517 (Ext0->getOpcode() != Ext1->getOpcode() ||
4518 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4519 return false;
4520
// No extend at all is treated as zero-extend for costing purposes.
4521 bool IsZExt =
4522 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4523 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4524 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4525 SrcVecTy, CostKind);
4526
4527 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4528 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4529 InstructionCost ExtCost = 0;
4530 if (Ext0)
4531 ExtCost += Ext0->computeCost(VF, Ctx);
4532 if (Ext1)
4533 ExtCost += Ext1->computeCost(VF, Ctx);
4534 if (OuterExt)
4535 ExtCost += OuterExt->computeCost(VF, Ctx);
4536
// Fused form must beat the sum of the recipes it replaces.
4537 return MulAccCost.isValid() &&
4538 MulAccCost < ExtCost + MulCost + RedCost;
4539 },
4540 Range);
4541 };
4542
4543 VPValue *VecOp = Red->getVecOp();
4544 VPRecipeBase *Sub = nullptr;
4545 VPValue *A, *B;
4546 VPValue *Tmp = nullptr;
4547
4548 if (RedTy->isFloatingPointTy())
4549 return nullptr;
4550
4551 // Sub reductions could have a sub between the add reduction and vec op.
4552 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4553 Sub = VecOp->getDefiningRecipe();
4554 VecOp = Tmp;
4555 }
4556
4557 // If ValB is a constant and can be safely extended, truncate it to the same
4558 // type as ExtA's operand, then extend it to the same type as ExtA. This
4559 // creates two uniform extends that can more easily be matched by the rest of
4560 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4561 // replaced with the new extend of the constant.
4562 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4563 VPWidenCastRecipe *&ExtB,
4564 VPValue *&ValB, VPWidenRecipe *Mul) {
4565 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4566 return;
4567 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4568 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4569 const APInt *Const;
// (orig. line 4571, the safe-extension check callee, is missing from dump)
4570 if (!match(ValB, m_APInt(Const)) ||
4572 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4573 return;
4574 // The truncate ensures that the type of each extended operand is the
4575 // same, and it's been proven that the constant can be extended from
4576 // NarrowTy safely. Necessary since ExtA's extended operand would be
4577 // e.g. an i8, while the const will likely be an i32. This will be
4578 // elided by later optimisations.
4579 VPBuilder Builder(Mul);
4580 auto *Trunc =
4581 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4582 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4583 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4584 Mul->setOperand(1, ExtB);
4585 };
4586
4587 // Try to match reduce.add(mul(...)).
4588 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4589 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4590 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4591 auto *Mul = cast<VPWidenRecipe>(VecOp);
4592
4593 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4594 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4595
4596 // Match reduce.add/sub(mul(ext, ext)).
4597 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4598 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4599 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4600 if (Sub)
4601 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4602 cast<VPWidenRecipe>(Sub), Red);
4603 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4604 }
4605 // TODO: Add an expression type for this variant with a negated mul
4606 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4607 return new VPExpressionRecipe(Mul, Red);
4608 }
4609 // TODO: Add an expression type for negated versions of other expression
4610 // variants.
4611 if (Sub)
4612 return nullptr;
4613
4614 // Match reduce.add(ext(mul(A, B))).
4615 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4616 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4617 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4618 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4619 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4620
4621 // reduce.add(ext(mul(ext, const)))
4622 // -> reduce.add(ext(mul(ext, ext(const))))
4623 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4624
4625 // reduce.add(ext(mul(ext(A), ext(B))))
4626 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4627 // The inner extends must either have the same opcode as the outer extend or
4628 // be the same, in which case the multiply can never result in a negative
4629 // value and the outer extend can be folded away by doing wider
4630 // extends for the operands of the mul.
4631 if (Ext0 && Ext1 &&
4632 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4633 Ext0->getOpcode() == Ext1->getOpcode() &&
4634 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4635 auto *NewExt0 = new VPWidenCastRecipe(
4636 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4637 *Ext0, *Ext0, Ext0->getDebugLoc())
4638 NewExt0->insertBefore(Ext0);
4639
// Reuse a single widened extend when both mul operands are the same recipe.
4640 VPWidenCastRecipe *NewExt1 = NewExt0;
4641 if (Ext0 != Ext1) {
4642 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4643 Ext->getResultType(), nullptr, *Ext1,
4644 *Ext1, Ext1->getDebugLoc());
4645 NewExt1->insertBefore(Ext1);
4646 }
4647 Mul->setOperand(0, NewExt0);
4648 Mul->setOperand(1, NewExt1);
4649 Red->setOperand(1, Mul);
4650 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4651 }
4652 }
4653 return nullptr;
4654 }
4655
4656 /// This function tries to create abstract recipes from the reduction recipe for
4657 /// following optimizations and cost estimation.
// NOTE(review): the declarator line (orig. 4658) was dropped by the doxygen
// extraction; from the body it takes a `VPReductionRecipe *Red` first
// parameter — confirm against upstream VPlanTransforms.cpp.
4659 VPCostContext &Ctx,
4660 VFRange &Range) {
4661 // Creation of VPExpressions for partial reductions is entirely handled in
4662 // transformToPartialReduction.
4663 assert(!Red->isPartialReduction() &&
4664 "This path does not support partial reductions");
4665
// Prefer the multiply-accumulate form; fall back to the extended-reduction
// form. Insertion point is captured before Red is wrapped.
4666 VPExpressionRecipe *AbstractR = nullptr;
4667 auto IP = std::next(Red->getIterator());
4668 auto *VPBB = Red->getParent();
4669 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4670 AbstractR = MulAcc;
4671 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4672 AbstractR = ExtRed;
4673 // Cannot create abstract inloop reduction recipes.
4674 if (!AbstractR)
4675 return;
4676
4677 AbstractR->insertBefore(*VPBB, IP);
4678 Red->replaceAllUsesWith(AbstractR);
4679 }
4680
4691
// NOTE(review): the doc comment (orig. 4680-4690) and declarator (orig. 4692)
// were dropped by the doxygen extraction. From the body this appears to be
// the broadcast-materialization transform (inserts explicit
// VPInstruction::Broadcast for values defined outside the vector loop) —
// confirm name/signature against upstream VPlanTransforms.cpp.
4693 if (Plan.hasScalarVFOnly())
4694 return;
4695
4696#ifndef NDEBUG
4697 VPDominatorTree VPDT(Plan);
4698#endif
4699
// Candidates: backedge-taken count, live-ins, and values defined in the
// entry block.
4700 SmallVector<VPValue *> VPValues;
4701 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4702 VPValues.push_back(BTC);
4703 append_range(VPValues, Plan.getLiveIns());
4704 for (VPRecipeBase &R : *Plan.getEntry())
4705 append_range(VPValues, R.definedValues());
4706
4707 auto *VectorPreheader = Plan.getVectorPreheader();
4708 for (VPValue *VPV : VPValues) {
// (orig. line 4709, the first half of this skip-condition, is missing)
4710 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4711 continue;
4712
4713 // Add explicit broadcast at the insert point that dominates all users.
4714 VPBasicBlock *HoistBlock = VectorPreheader;
4715 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4716 for (VPUser *User : VPV->users()) {
4717 if (User->usesScalars(VPV))
4718 continue;
4719 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4720 HoistPoint = HoistBlock->begin();
4721 else
4722 assert(VPDT.dominates(VectorPreheader,
4723 cast<VPRecipeBase>(User)->getParent()) &&
4724 "All users must be in the vector preheader or dominated by it");
4725 }
4726
4727 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4728 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Only vector (non-scalar) users are redirected to the broadcast.
4729 VPV->replaceUsesWithIf(Broadcast,
4730 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4731 return Broadcast != &U && !U.usesScalars(VPV);
4732 });
4733 }
4734 }
4735
// NOTE(review): the declarator (orig. 4736) was dropped by the doxygen
// extraction. From the body this hoists single-scalar replicated loads with
// loop-invariant addresses into the vector preheader when noalias-scope
// metadata proves they cannot alias any store in the loop — confirm the
// name/signature against upstream VPlanTransforms.cpp.
4737 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4738
4739 // Collect candidate loads with invariant addresses and noalias scopes
4740 // metadata and memory-writing recipes with noalias metadata.
// (orig. lines 4741-4743, the container declarations and loop header, are
// missing from this dump)
4744 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4745 for (VPRecipeBase &R : *VPBB) {
4746 // Only handle single-scalar replicated loads with invariant addresses.
4747 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4748 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4749 RepR->getOpcode() != Instruction::Load)
4750 continue;
4751
4752 VPValue *Addr = RepR->getOperand(0);
4753 if (Addr->isDefinedOutsideLoopRegions()) {
// (orig. line 4754, obtaining the load's MemoryLocation, is missing)
4755 if (!Loc.AATags.Scope)
4756 continue;
4757 CandidateLoads.push_back({RepR, Loc});
4758 }
4759 }
4760 if (R.mayWriteToMemory()) {
// (orig. line 4761, obtaining the writer's MemoryLocation, is missing)
// Bail out of the whole transform on a write without usable metadata.
4762 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4763 return;
4764 Stores.push_back(*Loc);
4765 }
4766 }
4767 }
4768
4769 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4770 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4771 // Hoist the load to the preheader if it doesn't alias with any stores
4772 // according to the noalias metadata. Other loads should have been hoisted
4773 // by other passes
4774 const AAMDNodes &LoadAA = LoadLoc.AATags;
4775 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
// (orig. line 4776, the scoped-noalias query callee, is missing)
4777 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4778 })) {
4779 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4780 }
4781 }
4782 }
4783
4784 // Collect common metadata from a group of replicate recipes by intersecting
4785 // metadata from all recipes in the group.
// NOTE(review): the declarator (orig. 4786) was dropped by the doxygen
// extraction; callers below use `getCommonMetadata(Group)` with a
// SmallVector<VPReplicateRecipe *>, and the body returns VPIRMetadata.
// Seeds with the first recipe's metadata, then intersects with the rest.
4787 VPIRMetadata CommonMetadata = *Recipes.front();
4788 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4789 CommonMetadata.intersect(*Recipe);
4790 return CommonMetadata;
4791 }
4792
4793 template <unsigned Opcode>
// NOTE(review): the declarator (orig. 4794-4796) was dropped by the doxygen
// extraction. From the body: groups predicated replicated loads/stores
// (selected by the Opcode template parameter) that share address/type and
// keeps groups containing at least one complementary mask pair (M and
// not(M)) — confirm the name/signature against upstream VPlanTransforms.cpp.
4797 const Loop *L) {
4798 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4799 "Only Load and Store opcodes supported");
4800 constexpr bool IsLoad = (Opcode == Instruction::Load);
4801 VPTypeAnalysis TypeInfo(Plan);
4802
4803 // For each address, collect operations with the same or complementary masks.
// (orig. line 4804, the result-container declaration, is missing)
// For loads the recipe itself is the value; for stores it is operand 0.
4805 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4806 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4807 };
// (orig. line 4808, the per-address grouping call header, is missing)
4809 Plan, PSE, L,
4810 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4811 for (auto Recipes : Groups) {
4812 if (Recipes.size() < 2)
4813 continue;
4814
4815 // Collect groups with the same or complementary masks.
// Entries are nulled out once consumed so each recipe joins one group.
4816 for (VPReplicateRecipe *&RecipeI : Recipes) {
4817 if (!RecipeI)
4818 continue;
4819
4820 VPValue *MaskI = RecipeI->getMask();
4821 Type *TypeI = GetLoadStoreValueType(RecipeI);
// (orig. line 4822, the Group container declaration, is missing)
4823 Group.push_back(RecipeI);
4824 RecipeI = nullptr;
4825
4826 // Find all operations with the same or complementary masks.
4827 bool HasComplementaryMask = false;
4828 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4829 if (!RecipeJ)
4830 continue;
4831
4832 VPValue *MaskJ = RecipeJ->getMask();
4833 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4834 if (TypeI == TypeJ) {
4835 // Check if any operation in the group has a complementary mask with
4836 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4837 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4838 match(MaskJ, m_Not(m_Specific(MaskI)));
4839 Group.push_back(RecipeJ);
4840 RecipeJ = nullptr;
4841 }
4842 }
4843
4844 if (HasComplementaryMask) {
4845 assert(Group.size() >= 2 && "must have at least 2 entries");
4846 AllGroups.push_back(std::move(Group));
4847 }
4848 }
4849 }
4850
4851 return AllGroups;
4852 }
4853
4854 // Find the recipe with minimum alignment in the group.
// NOTE(review): the declarator (orig. 4857) was dropped by the doxygen
// extraction; callers use `findRecipeWithMinAlign<LoadInst>(Group)` /
// `<StoreInst>(Group)`, so InstType is the underlying IR memory instruction.
// Returns the group member whose underlying instruction has the smallest
// alignment (used so the merged access never over-claims alignment).
4855 template <typename InstType>
4856 static VPReplicateRecipe *
4858 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4859 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4860 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4861 });
4862 }
4863
// NOTE(review): the declarator (orig. 4864-4865) was dropped by the doxygen
// extraction. From the body: replaces groups of predicated loads with
// same/complementary masks by one unpredicated load at the earliest member —
// confirm the name/signature against upstream VPlanTransforms.cpp.
4866 const Loop *L) {
4867 auto Groups =
// (orig. line 4868, the grouping-helper call for Instruction::Load, missing)
4869 if (Groups.empty())
4870 return;
4871
4872 // Process each group of loads.
4873 for (auto &Group : Groups) {
4874 // Try to use the earliest (most dominating) load to replace all others.
4875 VPReplicateRecipe *EarliestLoad = Group[0];
4876 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4877 VPBasicBlock *LastBB = Group.back()->getParent();
4878
4879 // Check that the load doesn't alias with stores between first and last.
4880 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4881 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4882 continue;
4883
4884 // Collect common metadata from all loads in the group.
4885 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4886
4887 // Find the load with minimum alignment to use.
4888 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4889
4890 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4891 assert(all_of(Group,
4892 [IsSingleScalar](VPReplicateRecipe *R) {
4893 return R->isSingleScalar() == IsSingleScalar;
4894 }) &&
4895 "all members in group must agree on IsSingleScalar");
4896
4897 // Create an unpredicated version of the earliest load with common
4898 // metadata.
4899 auto *UnpredicatedLoad = new VPReplicateRecipe(
4900 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4901 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4902
4903 UnpredicatedLoad->insertBefore(EarliestLoad);
4904
4905 // Replace all loads in the group with the unpredicated load.
4906 for (VPReplicateRecipe *Load : Group) {
4907 Load->replaceAllUsesWith(UnpredicatedLoad);
4908 Load->eraseFromParent();
4909 }
4910 }
4911 }
4912
// NOTE(review): the declarator line (orig. 4914) was dropped by the doxygen
// extraction; the caller below invokes this as
// `canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo)` with an
// ArrayRef/SmallVector of VPReplicateRecipe* stores to sink.
// Returns true when the whole store group can be sunk past intervening
// memory operations based on noalias-scope metadata.
4913 static bool
4915 PredicatedScalarEvolution &PSE, const Loop &L,
4916 VPTypeAnalysis &TypeInfo) {
4917 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4918 if (!StoreLoc || !StoreLoc->AATags.Scope)
4919 return false;
4920
4921 // When sinking a group of stores, all members of the group alias each other.
4922 // Skip them during the alias checks.
4923 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4924 StoresToSink.end());
4925
4926 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4927 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4928 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4929 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4930 }
4931
4934 const Loop *L) {
4935 auto Groups =
4937 if (Groups.empty())
4938 return;
4939
4940 VPTypeAnalysis TypeInfo(Plan);
4941
4942 for (auto &Group : Groups) {
4943 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4944 continue;
4945
4946 // Use the last (most dominated) store's location for the unconditional
4947 // store.
4948 VPReplicateRecipe *LastStore = Group.back();
4949 VPBasicBlock *InsertBB = LastStore->getParent();
4950
4951 // Collect common alias metadata from all stores in the group.
4952 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4953
4954 // Build select chain for stored values.
4955 VPValue *SelectedValue = Group[0]->getOperand(0);
4956 VPBuilder Builder(InsertBB, LastStore->getIterator());
4957
4958 bool IsSingleScalar = Group[0]->isSingleScalar();
4959 for (unsigned I = 1; I < Group.size(); ++I) {
4960 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4961 "all members in group must agree on IsSingleScalar");
4962 VPValue *Mask = Group[I]->getMask();
4963 VPValue *Value = Group[I]->getOperand(0);
4964 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4965 Group[I]->getDebugLoc());
4966 }
4967
4968 // Find the store with minimum alignment to use.
4969 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4970
4971 // Create unconditional store with selected value and common metadata.
4972 auto *UnpredicatedStore = new VPReplicateRecipe(
4973 StoreWithMinAlign->getUnderlyingInstr(),
4974 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4975 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4976 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4977
4978 // Remove all predicated stores from the group.
4979 for (VPReplicateRecipe *Store : Group)
4980 Store->eraseFromParent();
4981 }
4982}
4983
// NOTE(review): the declarator line (orig. 4984) was dropped by the doxygen
// extraction. From the body: when the trip count is a SCEV constant and no
// tail folding / scalar epilogue complicates things, sets the vector trip
// count's underlying constant to (TC / (VF*UF)) * (VF*UF) — confirm
// name/signature against upstream VPlanTransforms.cpp.
4985 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
// (orig. line 4986, the PSE parameter line, is missing from this dump)
4987 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4988 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4989
4990 VPValue *TC = Plan.getTripCount();
4991 if (TC->getNumUsers() == 0)
4992 return;
4993
4994 // Skip cases for which the trip count may be non-trivial to materialize.
4995 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4996 // tail is required.
4997 if (!Plan.hasScalarTail() ||
// (orig. line 4998, the first half of this condition, is missing)
4999 Plan.getScalarPreheader() ||
5000 !isa<VPIRValue>(TC))
5001 return;
5002
5003 // Materialize vector trip counts for constants early if it can simply
5004 // be computed as (Original TC / VF * UF) * VF * UF.
5005 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5006 // tail-folded loops.
5007 ScalarEvolution &SE = *PSE.getSE();
5008 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5009 if (!isa<SCEVConstant>(TCScev))
5010 return;
5011 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5012 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5013 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5014 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5015 }
5016
// NOTE(review): the declarator line (orig. 5017) was dropped by the doxygen
// extraction, as was line 5019 fetching the backedge-taken count into BTC.
// From the body: materializes the backedge-taken count as `TripCount - 1`
// in the vector preheader — confirm against upstream VPlanTransforms.cpp.
5018 VPBasicBlock *VectorPH) {
// (orig. line 5019, initializing BTC from the plan, is missing)
5020 if (BTC->getNumUsers() == 0)
5021 return;
5022
5023 VPBuilder Builder(VectorPH, VectorPH->begin());
5024 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5025 auto *TCMO =
5026 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5027 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5028 BTC->replaceAllUsesWith(TCMO);
5029 }
5030
// NOTE(review): the declarator (orig. 5031) was dropped by the doxygen
// extraction. From the body: materializes explicit BuildVector/Unpack
// VPInstructions for scalar<->vector transitions — confirm name/signature
// against upstream VPlanTransforms.cpp.
5032 if (Plan.hasScalarVFOnly())
5033 return;
5034
5035 VPTypeAnalysis TypeInfo(Plan);
5036 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5037 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
// (orig. line 5038, the traversal argument for blocks outside the region,
// is missing from this dump)
5039 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5040 vp_depth_first_shallow(LoopRegion->getEntry()));
5041 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5042 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5043 // regions. Those are not materialized explicitly yet. Those vector users are
5044 // still handled in VPReplicateRegion::execute(), via shouldPack().
5045 // TODO: materialize build vectors for replicating recipes in replicating
5046 // regions.
5047 for (VPBasicBlock *VPBB :
5048 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5049 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// (orig. line 5050, the isa<> filter beginning this skip-check, is missing)
5051 continue;
5052 auto *DefR = cast<VPSingleDefRecipe>(&R);
5053 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5054 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5055 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5056 };
5057 if ((isa<VPReplicateRecipe>(DefR) &&
5058 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5059 (isa<VPInstruction>(DefR) &&
// (orig. line 5060, the middle of this VPInstruction condition, is missing)
5061 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5062 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5063 continue;
5064
// Struct-typed results need BuildStructVector; plain ones BuildVector.
5065 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5066 unsigned Opcode = ScalarTy->isStructTy()
// (orig. lines 5067-5068, the two opcode alternatives, are missing)
5069 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5070 BuildVector->insertAfter(DefR);
5071
5072 DefR->replaceUsesWithIf(
5073 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5074 VPUser &U, unsigned) {
5075 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5076 });
5077 }
5078 }
5079
5080 // Create explicit VPInstructions to convert vectors to scalars. The current
5081 // implementation is conservative - it may miss some cases that may or may not
5082 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5083 // if they are known to operate on scalar values.
5084 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5085 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// (orig. line 5086, the first part of this isa<> skip-filter, is missing)
5087 VPDerivedIVRecipe>(&R))
5088 continue;
5089 for (VPValue *Def : R.definedValues()) {
5090 // Skip recipes that are single-scalar or only have their first lane
5091 // used.
5092 // TODO: The Defs skipped here may or may not be vector values.
5093 // Introduce Unpacks, and remove them later, if they are guaranteed to
5094 // produce scalar values.
// (orig. line 5095, the condition of that skip, is missing)
5096 continue;
5097
5098 // At the moment, we create unpacks only for scalar users outside
5099 // replicate regions. Recipes inside replicate regions still extract the
5100 // required lanes implicitly.
5101 // TODO: Remove once replicate regions are unrolled completely.
5102 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5103 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5104 return U->usesScalars(Def) &&
5105 (!ParentRegion || !ParentRegion->isReplicator());
5106 };
5107 if (none_of(Def->users(), IsCandidateUnpackUser))
5108 continue;
5109
5110 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
// Phis must keep their position at the block start; insert after them.
5111 if (R.isPhi())
5112 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5113 else
5114 Unpack->insertAfter(&R);
5115 Def->replaceUsesWithIf(Unpack,
5116 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5117 return IsCandidateUnpackUser(&U);
5118 });
5119 }
5120 }
5121 }
5122 }
5123
// NOTE(review): the declarator line (orig. 5124) was dropped by the doxygen
// extraction. From the body: emits the computation of the vector trip count
// (N rounded for tail folding or reduced for a required scalar epilogue)
// into the vector preheader — confirm name against upstream
// VPlanTransforms.cpp.
5125 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5126 bool RequiresScalarEpilogue, VPValue *Step,
5127 std::optional<uint64_t> MaxRuntimeStep) {
5128 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5129 // There's nothing to do if there are no users of the vector trip count or its
5130 // IR value has already been set.
5131 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5132 return;
5133
5134 VPValue *TC = Plan.getTripCount();
5135 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5136 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5137 if (auto *StepR = Step->getDefiningRecipe()) {
5138 assert(StepR->getParent() == VectorPHVPBB &&
5139 "Step must be defined in VectorPHVPBB");
5140 // Insert after Step's definition to maintain valid def-use ordering.
5141 InsertPt = std::next(StepR->getIterator());
5142 }
5143 VPBuilder Builder(VectorPHVPBB, InsertPt);
5144
5145 // For scalable steps, if TC is a constant and is divisible by the maximum
5146 // possible runtime step, then TC % Step == 0 for all valid vscale values
5147 // and the vector trip count equals TC directly.
5148 const APInt *TCVal;
5149 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5150 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5151 VectorTC.replaceAllUsesWith(TC);
5152 return;
5153 }
5154
5155 // If the tail is to be folded by masking, round the number of iterations N
5156 // up to a multiple of Step instead of rounding down. This is done by first
5157 // adding Step-1 and then rounding down. Note that it's ok if this addition
5158 // overflows: the vector induction variable will eventually wrap to zero given
5159 // that it starts at zero and its Step is a power of two; the loop will then
5160 // exit, with the last early-exit vector comparison also producing all-true.
5161 if (TailByMasking) {
5162 TC = Builder.createAdd(
5163 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5164 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5165 }
5166
5167 // Now we need to generate the expression for the part of the loop that the
5168 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5169 // iterations are not required for correctness, or N - Step, otherwise. Step
5170 // is equal to the vectorization factor (number of SIMD elements) times the
5171 // unroll factor (number of SIMD instructions).
5172 VPValue *R =
5173 Builder.createNaryOp(Instruction::URem, {TC, Step},
5174 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5175
5176 // There are cases where we *must* run at least one iteration in the remainder
5177 // loop. See the cost model for when this can happen. If the step evenly
5178 // divides the trip count, we set the remainder to be equal to the step. If
5179 // the step does not evenly divide the trip count, no adjustment is necessary
5180 // since there will already be scalar iterations. Note that the minimum
5181 // iterations check ensures that N >= Step.
5182 if (RequiresScalarEpilogue) {
// (typo in original assert string: "fail folding" — presumably "tail
// folding"; left untouched since string literals are program text)
5183 assert(!TailByMasking &&
5184 "requiring scalar epilogue is not supported with fail folding");
5185 VPValue *IsZero =
5186 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5187 R = Builder.createSelect(IsZero, Step, R);
5188 }
5189
5190 VPValue *Res =
5191 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5192 VectorTC.replaceAllUsesWith(Res);
5193 }
5194
// NOTE(review): the declarator line (orig. 5195) was dropped by the doxygen
// extraction. From the body: materializes the symbolic VF and VFxUF values
// as concrete element-count computations in the vector preheader — confirm
// name/signature against upstream VPlanTransforms.cpp.
5196 ElementCount VFEC) {
5197 // If VF and VFxUF have already been materialized (no remaining users),
5198 // there's nothing more to do.
5199 if (Plan.getVF().isMaterialized()) {
5200 assert(Plan.getVFxUF().isMaterialized() &&
5201 "VF and VFxUF must be materialized together");
5202 return;
5203 }
5204
5205 VPBuilder Builder(VectorPH, VectorPH->begin());
5206 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5207 VPValue &VF = Plan.getVF();
5208 VPValue &VFxUF = Plan.getVFxUF();
5209 // If there are no users of the runtime VF, compute VFxUF by constant folding
5210 // the multiplication of VF and UF.
5211 if (VF.getNumUsers() == 0) {
5212 VPValue *RuntimeVFxUF =
5213 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5214 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5215 return;
5216 }
5217
5218 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5219 // vscale) * UF.
5220 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// (orig. line 5221, the condition guarding the broadcast of VF, is missing;
// line 5223 with the replaceUsesWithIf call head is also absent)
5222 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5224 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
// (orig. line 5225, closing that conditional block, is missing)
5226 VF.replaceAllUsesWith(RuntimeVF);
5227
// VFxUF = (VF * vscale) * UF, with nuw on the multiply.
5228 VPValue *MulByUF = Builder.createOverflowingOp(
5229 Instruction::Mul,
5230 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5231 {true, false});
5232 VFxUF.replaceAllUsesWith(MulByUF);
5233 }
5234
// NOTE(review): the declarator (orig. 5235-5236) was dropped by the doxygen
// extraction. From the body: expands all VPExpandSCEVRecipes in the entry
// block to IR via SCEVExpander, replaces them with live-ins, and returns the
// SCEV->Value map — confirm name/signature against upstream
// VPlanTransforms.cpp.
5237 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5238
5239 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5240 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5241 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5242 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
// (orig. line 5243, a skip-filter condition for this loop, is missing)
5244 continue;
// Expand-SCEV recipes are expected to be grouped at the block start; stop
// at the first non-expand recipe.
5245 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5246 if (!ExpSCEV)
5247 break;
5248 const SCEV *Expr = ExpSCEV->getSCEV();
5249 Value *Res =
5250 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5251 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5252 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5253 ExpSCEV->replaceAllUsesWith(Exp);
// Keep the plan's trip count pointing at the expanded value.
5254 if (Plan.getTripCount() == ExpSCEV)
5255 Plan.resetTripCount(Exp);
5256 ExpSCEV->eraseFromParent();
5257 }
// (orig. lines 5258, the assert condition head, is missing from this dump)
5259 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5260 "before any VPIRInstructions");
5261 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5262 // to the VPIRBasicBlock.
5263 auto EI = Entry->begin();
5264 for (Instruction &I : drop_end(*EntryBB)) {
5265 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5266 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5267 EI++;
5268 continue;
5269 }
// (orig. line 5270, appending the wrapped IR instruction, is missing)
5271 }
5272
5273 return ExpandedSCEVs;
5274 }
5275
5276 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5277 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5278 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5279 /// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5280 /// an index-independent load if it feeds all wide ops at all indices (\p OpV
5281 /// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5282 /// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5283 /// is defined at \p Idx of a load interleave group.
5284 static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5285 VPValue *OpV, unsigned Idx, bool IsScalable) {
5286 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5287 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
// A live-in (no defining recipe) matches only if both operands are the
// exact same VPValue.
5288 if (!Member0OpR)
5289 return Member0Op == OpV;
5290 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5291 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5292 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5293 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5294 Member0Op == OpV;
5295 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5296 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5297 return false;
5298 }
5299
// Returns true if all recipes in \p Ops (the members feeding a store
// interleave group, one per lane) perform the same operation and each of
// their operands can be narrowed via canNarrowLoad; recurses so chains of
// matching wide ops are also accepted.
5300 static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
// (orig. line 5301, likely a getOpcodeOrIntrinsicID helper use/guard, is
// missing from this doxygen dump; so is the filter head at orig. 5306 and
// the OpsI container declaration at orig. 5314)
5302 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5303 if (!WideMember0)
5304 return false;
5305 for (VPValue *V : Ops) {
5307 return false;
// All members must perform the same operation as the lane-0 member.
5308 auto *R = cast<VPSingleDefRecipe>(V);
5309 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5310 return false;
5311 }
5312
5313 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5315 for (VPValue *Op : Ops)
5316 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5317
// Operand chains that are themselves narrowable wide ops are fine.
5318 if (canNarrowOps(OpsI, IsScalable))
5319 continue;
5320
5321 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5322 const auto &[OpIdx, OpV] = P;
5323 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5324 }))
5325 return false;
5326 }
5327
5328 return true;
5329 }
5330
/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
/// number of members both equal to VF. The interleave group must also access
/// the full vector width.
static std::optional<ElementCount> isConsecutiveInterleaveGroup(
    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
  // Masked groups cannot be narrowed.
  if (!InterleaveR || InterleaveR->getMask())
    return std::nullopt;

  // All members of the group must share a single scalar element type: use the
  // defined values for load groups and the stored values for store groups.
  Type *GroupElementTy = nullptr;
  if (InterleaveR->getStoredValues().empty()) {
    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
    if (!all_of(InterleaveR->definedValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return std::nullopt;
  } else {
    GroupElementTy =
        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
    if (!all_of(InterleaveR->getStoredValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return std::nullopt;
  }

  // The group must be full, i.e. have a member at every position.
  auto IG = InterleaveR->getInterleaveGroup();
  if (IG->getFactor() != IG->getNumMembers())
    return std::nullopt;

  // Minimum bit width of the target's vector registers for \p VF.
  auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
    TypeSize Size = TTI.getRegisterBitWidth(
    assert(Size.isScalable() == VF.isScalable() &&
           "if Size is scalable, VF must be scalable and vice versa");
    return Size.getKnownMinValue();
  };

  // Accept a VF where the group has exactly VF members and the group's total
  // size saturates the register width for that VF.
  for (ElementCount VF : VFs) {
    unsigned MinVal = VF.getKnownMinValue();
    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
    if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
      return {VF};
  }
  return std::nullopt;
}
5379
5380/// Returns true if \p VPValue is a narrow VPValue.
5381static bool isAlreadyNarrow(VPValue *VPV) {
5382 if (isa<VPIRValue>(VPV))
5383 return true;
5384 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5385 return RepR && RepR->isSingleScalar();
5386}
5387
// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
// a narrow variant. Already-narrow values are returned unchanged; newly
// created narrow recipes are recorded in NarrowedOps to avoid re-processing.
static VPValue *
  // Live-ins and values narrowed earlier need no further work.
  auto *R = V->getDefiningRecipe();
  if (!R || NarrowedOps.contains(V))
    return V;

  if (isAlreadyNarrow(V))
    return V;

    // Recurse into the operands of a wide single-def recipe, narrowing each
    // operand in place.
    auto *WideMember0 = cast<VPSingleDefRecipe>(R);
    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
      WideMember0->setOperand(
          Idx,
          narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
    return V;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
    auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
                                    LoadGroup->getMask(), /*Consecutive=*/true,
                                    {}, LoadGroup->getDebugLoc());
    L->insertBefore(LoadGroup);
    NarrowedOps.insert(L);
    return L;
  }

  if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
    // A single-scalar load is already as narrow as it gets; just record it.
    assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
           "must be a single scalar load");
    NarrowedOps.insert(RepR);
    return RepR;
  }

  // Strip a vector-pointer recipe so the scalar load uses the plain address.
  auto *WideLoad = cast<VPWidenLoadRecipe>(R);
  VPValue *PtrOp = WideLoad->getAddr();
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
    PtrOp = VecPtr->getOperand(0);
  // Narrow wide load to uniform scalar load, as transformed VPlan will only
  // process one original iteration.
  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
                                  /*IsUniform*/ true,
                                  /*Mask*/ nullptr, {}, *WideLoad);
  N->insertBefore(WideLoad);
  NarrowedOps.insert(N);
  return N;
}
5440
std::unique_ptr<VPlan>
    const TargetTransformInfo &TTI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

  if (!VectorLoop)
    return nullptr;

  // Only handle single-block loops for now.
  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
    return nullptr;

  // Skip plans when we may not be able to properly narrow.
  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
  if (!match(&Exiting->back(), m_BranchOnCount()))
    return nullptr;

  assert(match(&Exiting->back(),
               m_Specific(&Plan.getVectorTripCount()))) &&
         "unexpected branch-on-count");

  VPTypeAnalysis TypeInfo(Plan);
  std::optional<ElementCount> VFToOptimize;
  // Collect all store interleave groups that can be narrowed, and determine a
  // single VF the narrowing is valid for.
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    // Only support plans with a canonical induction phi.
    if (R.isPhi())
      return nullptr;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return nullptr;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Try to find a single VF, where all interleave groups are consecutive and
    // saturate the full vector width. If we already have a candidate VF, check
    // if it is applicable for the current InterleaveR, otherwise look for a
    // suitable VF across the Plan's VFs.
        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
                     : to_vector(Plan.vectorFactors());
    std::optional<ElementCount> NarrowedVF =
        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
      return nullptr;
    VFToOptimize = NarrowedVF;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Narrow interleave groups, if all operands are already matching narrow
    // ops.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    if (isAlreadyNarrow(Member0) &&
        all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // For now, we only support full interleave groups storing load interleave
    // groups.
    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
          return IR && IR->getInterleaveGroup()->isFull() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // Check if all values feeding InterleaveR are matching wide recipes, which
    // operands that can be narrowed.
    if (!canNarrowOps(InterleaveR->getStoredValues(),
                      VFToOptimize->isScalable()))
      return nullptr;
    StoreGroups.push_back(InterleaveR);
  }

  if (StoreGroups.empty())
    return nullptr;

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  bool RequiresScalarEpilogue =
      MiddleVPBB->getNumSuccessors() == 1 &&
      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
  // Bail out for tail-folding (middle block with a single successor to exit).
  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
    return nullptr;

  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
  // TODO: Handle cases where only some interleave groups can be narrowed.
  std::unique_ptr<VPlan> NewPlan;
  if (size(Plan.vectorFactors()) != 1) {
    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
    Plan.setVF(*VFToOptimize);
    NewPlan->removeVF(*VFToOptimize);
  }

  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res =
        narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
    auto *SI =
        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
    auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
                                     /*Consecutive=*/true, {},
                                     StoreGroup->getDebugLoc());
    S->insertBefore(StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  Type *CanIVTy = VectorLoop->getCanonicalIVType();
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPBuilder PHBuilder(VectorPH, VectorPH->begin());

  // The canonical IV now steps by UF (times vscale for scalable VFs) instead
  // of VF * UF; VF itself becomes 1 (or vscale).
  VPValue *UF = &Plan.getUF();
  VPValue *Step;
  if (VFToOptimize->isScalable()) {
    VPValue *VScale =
        PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
    Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
                                         {true, false});
    Plan.getVF().replaceAllUsesWith(VScale);
  } else {
    Step = UF;
    Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
  }
  // Materialize vector trip count with the narrowed step.
  materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
                             RequiresScalarEpilogue, Step);

  CanIVInc->setOperand(1, Step);
  Plan.getVFxUF().replaceAllUsesWith(Step);

  removeDeadRecipes(Plan);
  assert(none_of(*VectorLoop->getEntryBasicBlock(),
         "All VPVectorPointerRecipes should have been removed");
  return NewPlan;
}
5604
/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
/// BranchOnCond recipe.
    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  auto *MiddleTerm =
  // Only add branch metadata if there is a (conditional) terminator.
  if (!MiddleTerm)
    return;

  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
         "must have a BranchOnCond");
  // Assume that `TripCount % VectorStep ` is equally distributed.
  // VectorStep is the number of original iterations covered by one vector
  // iteration: concrete UF * VF, additionally scaled by the tuning vscale for
  // scalable VFs.
  unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
  if (VF.isScalable() && VScaleForTuning.has_value())
    VectorStep *= *VScaleForTuning;
  assert(VectorStep > 0 && "trip count should not be zero");
  // Under the uniform-remainder assumption, weight the two successors
  // 1 : (VectorStep - 1).
  MDBuilder MDB(Plan.getContext());
  MDNode *BranchWeights =
      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
  MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
}
5628
    VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Predicate for VF == vscale x 1, where extracting the penultimate element
  // is not possible if vscale turns out to be 1 at runtime.
  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences, creating
    // extract for users outside the loop. An overview of the transformation is
    // described below. Suppose we have the following loop with some use after
    // the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because it's value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand for
    // the VPIRInstruction modeling the phi.
             make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
        continue;

      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead we rely on the existing
      // extract of the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice.
      // TODO: Consider vscale_range info and UF.
              Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
          "vector.recur.extract.for.phi");
      // Rewire all exit-block phi users of the extracted value to use the
      // penultimate element instead.
      for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
        auto *ExitPhi = dyn_cast<VPIRPhi>(U);
        if (!ExitPhi)
          continue;
        ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
      }
    }
  }
}
5742
/// Check if \p V is a binary expression of a widened IV and a loop-invariant
/// value. Returns the widened IV if found, nullptr otherwise.
  auto *BinOp = dyn_cast<VPWidenRecipe>(V);
  // Only plain binary ops qualify; integer div/rem are explicitly excluded.
  if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
      Instruction::isIntDivRem(BinOp->getOpcode()))
    return nullptr;

  // The widened IV may be on either side; normalize so operand 0 is the IV
  // candidate.
  VPValue *WidenIVCandidate = BinOp->getOperand(0);
  VPValue *InvariantCandidate = BinOp->getOperand(1);
  if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
    std::swap(WidenIVCandidate, InvariantCandidate);

  // The other operand must be defined outside the loop regions, i.e. be
  // loop-invariant.
  if (!InvariantCandidate->isDefinedOutsideLoopRegions())
    return nullptr;

  return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
}
5761
/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
         BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
  auto *ClonedOp = BinOp->clone();
  // Replace whichever operand was the widened IV with the scalar IV.
  if (ClonedOp->getOperand(0) == WidenIV) {
    ClonedOp->setOperand(0, ScalarIV);
  } else {
    assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
    ClonedOp->setOperand(1, ScalarIV);
  }
  // Insert directly after ScalarIV's definition so the clone is dominated by
  // its new operand.
  ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
  return ClonedOp;
}
5778
                                                Loop &L) {
  ScalarEvolution &SE = *PSE.getSE();
  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();

  // Helper lambda to check if the IV range excludes the sentinel value. Try
  // signed first, then unsigned. Return an excluded sentinel if found,
  // otherwise return std::nullopt.
  auto CheckSentinel = [&SE](const SCEV *IVSCEV,
                             bool UseMax) -> std::optional<APSInt> {
    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
    for (bool Signed : {true, false}) {
      // A max reduction uses the minimum value as sentinel and vice versa.
      APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
                               : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);

      ConstantRange IVRange =
          Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
      if (!IVRange.contains(Sentinel))
        return Sentinel;
    }
    return std::nullopt;
  };

  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
  for (VPRecipeBase &Phi :
       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
        PhiR->getRecurrenceKind()))
      continue;

    // Only integer IV values are supported; pointer and FP phis are skipped.
    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
      continue;

    // If there's a header mask, the backedge select will not be the find-last
    // select.
    VPValue *BackedgeVal = PhiR->getBackedgeValue();
    auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
    if (HeaderMask &&
        !match(BackedgeVal,
               m_Select(m_Specific(HeaderMask),
                        m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
      llvm_unreachable("expected header mask select");

    // Get the find-last expression from the find-last select of the reduction
    // phi. The find-last select should be a select between the phi and the
    // find-last expression.
    VPValue *Cond, *FindLastExpression;
    if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
                                        m_VPValue(FindLastExpression))) &&
        !match(FindLastSelect,
               m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
                        m_Specific(PhiR))))
      continue;

    // Check if FindLastExpression is a simple expression of a widened IV. If
    // so, we can track the underlying IV instead and sink the expression.
    auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
        IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
        &L);
    const SCEV *Step;
    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
      assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
             "IVOfExpressionToSink not being an AddRec must imply "
             "FindLastExpression not being an AddRec.");
      continue;
    }

    // Determine direction from SCEV step.
    if (!SE.isKnownNonZero(Step))
      continue;

    // Positive step means we need UMax/SMax to find the last IV value, and
    // UMin/SMin otherwise.
    bool UseMax = SE.isKnownPositive(Step);
    std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
    bool UseSigned = SentinelVal && SentinelVal->isSigned();

    // Sinking an expression will disable epilogue vectorization. Only use it,
    // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
    // also prevent vectorizing using a sentinel (e.g., if the expression is a
    // multiply or divide by large constant, respectively), which also makes
    // sinking undesirable.
    if (IVOfExpressionToSink) {
      const SCEV *FindLastExpressionSCEV =
          vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
      if (match(FindLastExpressionSCEV,
                m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
        bool NewUseMax = SE.isKnownPositive(Step);
        if (auto NewSentinel =
                CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
          // The original expression already has a sentinel, so prefer not
          // sinking to keep epilogue vectorization possible.
          SentinelVal = *NewSentinel;
          UseSigned = NewSentinel->isSigned();
          UseMax = NewUseMax;
          IVSCEV = FindLastExpressionSCEV;
          IVOfExpressionToSink = nullptr;
        }
      }
    }

    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
    // if the condition was ever true. Requires the IV to not wrap, otherwise we
    // cannot use min/max.
    if (!SentinelVal) {
      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
      if (AR->hasNoSignedWrap())
        UseSigned = true;
      else if (AR->hasNoUnsignedWrap())
        UseSigned = false;
      else
        continue;
    }

        BackedgeVal,

    VPValue *NewFindLastSelect = BackedgeVal;
    VPValue *SelectCond = Cond;
    if (!SentinelVal || IVOfExpressionToSink) {
      // When we need to create a new select, normalize the condition so that
      // PhiR is the last operand and include the header mask if needed.
      DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
      VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
      if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
        SelectCond = LoopBuilder.createNot(SelectCond);

      // When tail folding, mask the condition with the header mask to prevent
      // propagating poison from inactive lanes in the last vector iteration.
      if (HeaderMask)
        SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);

      if (SelectCond != Cond || IVOfExpressionToSink) {
        NewFindLastSelect = LoopBuilder.createSelect(
            SelectCond,
            IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
            PhiR, DL);
      }
    }

    // Create the reduction result in the middle block using sentinel directly.
    RecurKind MinMaxKind =
        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
                    FastMathFlags());
    DebugLoc ExitDL = RdxResult->getDebugLoc();
    VPBuilder MiddleBuilder(RdxResult);
    VPValue *ReducedIV =
        NewFindLastSelect, Flags, ExitDL);

    // If IVOfExpressionToSink is an expression to sink, sink it now.
    VPValue *VectorRegionExitingVal = ReducedIV;
    if (IVOfExpressionToSink)
      VectorRegionExitingVal =
          cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
                                ReducedIV, IVOfExpressionToSink);

    VPValue *NewRdxResult;
    VPValue *StartVPV = PhiR->getStartValue();
    if (SentinelVal) {
      // Sentinel-based approach: reduce IVs with min/max, compare against
      // sentinel to detect if condition was ever true, select accordingly.
      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
                                           Sentinel, ExitDL);
      NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
                                                StartVPV, ExitDL);
      StartVPV = Sentinel;
    } else {
      // Introduce a boolean AnyOf reduction to track if the condition was ever
      // true in the loop. Use it to select the initial start value, if it was
      // never true.
      auto *AnyOfPhi = new VPReductionPHIRecipe(
          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
      AnyOfPhi->insertAfter(PhiR);

      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
      AnyOfPhi->setOperand(1, OrVal);

      NewRdxResult = MiddleBuilder.createAnyOfReduction(
          OrVal, VectorRegionExitingVal, StartVPV, ExitDL);

      // Initialize the IV reduction phi with the neutral element, not the
      // original start value, to ensure correct min/max reduction results.
      StartVPV = Plan.getOrAddLiveIn(
          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
    }
    RdxResult->replaceAllUsesWith(NewRdxResult);
    RdxResult->eraseFromParent();

    // Replace the original FindLast phi with a FindIV reduction phi using the
    // chosen start value and the (possibly rebuilt) backedge select.
    auto *NewPhiR = new VPReductionPHIRecipe(
        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
        *NewFindLastSelect, RdxUnordered{1}, {},
        PhiR->hasUsesOutsideReductionChain());
    NewPhiR->insertBefore(PhiR);
    PhiR->replaceAllUsesWith(NewPhiR);
    PhiR->eraseFromParent();
  }
}
5988
5989namespace {
5990
using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes a single extend feeding a partial reduction: the scalar source
/// type being extended and the kind of extend (defaults mean "no extend").
struct ReductionExtend {
  Type *SrcType = nullptr;
  ExtendKind Kind = ExtendKind::PR_None;
};
5996
/// Describes the extends used to compute the extended reduction operand.
/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
/// operation.
struct ExtendedReductionOperand {
  /// The recipe that consumes the extends.
  VPWidenRecipe *ExtendsUser = nullptr;
  /// Extend descriptions (inputs to getPartialReductionCost). ExtendB is only
  /// meaningful when ExtendsUser is a binary operation.
  ReductionExtend ExtendA, ExtendB;
};
6006
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extended op, accumulator), or
/// reduction_bin_op (accumulator, extended op).
/// The possible forms of the "extended op" are listed in
/// matchExtendedReductionOperand.
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp = nullptr;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  RecurKind RK;
  /// The index of the accumulator operand of ReductionBinOp. The extended op
  /// is `1 - AccumulatorOpIdx`.
  unsigned AccumulatorOpIdx;
  /// The VF scale factor of the partial reduction (passed on as
  /// RdxUnordered::VFScaleFactor when creating the reduction recipe).
  unsigned ScaleFactor;
};
6027
/// Rewrite the extends feeding partial-reduction operand \p Op into forms that
/// lower to cheaper partial reductions, returning the (possibly replaced)
/// operand recipe. \p Op is returned unchanged if no rewrite applies.
static VPSingleDefRecipe *
optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
                                   VPTypeAnalysis &TypeInfo) {
  // reduce.add(mul(ext(A), C))
  // -> reduce.add(mul(ext(A), ext(trunc(C))))
  const APInt *Const;
  if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
    auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
    if (!Op->hasOneUse() ||
            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
      return Op;

    // Re-extend the constant from the narrow type with the same extend kind
    // as A's extend.
    VPBuilder Builder(Op);
    auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
                                          Op->getOperand(1), NarrowTy);
    Type *WideTy = TypeInfo.inferScalarType(ExtA);
    Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
    return Op;
  }

  // reduce.add(abs(sub(ext(A), ext(B))))
  // -> reduce.add(ext(absolute-difference(A, B)))
  VPValue *X, *Y;
    auto *Sub = Op->getOperand(0)->getDefiningRecipe();
    auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
    assert(Ext->getOpcode() ==
               cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
           "Expected both the LHS and RHS extends to be the same");
    bool IsSigned = Ext->getOpcode() == Instruction::SExt;
    VPBuilder Builder(Op);
    Type *SrcTy = TypeInfo.inferScalarType(X);
    // X and Y are each used twice (in max and min); freeze them so both uses
    // observe the same value even if an input is poison/undef.
    auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
    auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
    // Compute |A - B| as max(A, B) - min(A, B) in the narrow type, then
    // zero-extend the non-negative difference to the original wide type.
    auto *Max = Builder.insert(
        new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                   {FreezeX, FreezeY}, SrcTy));
    auto *Min = Builder.insert(
        new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                   {FreezeX, FreezeY}, SrcTy));
    auto *AbsDiff =
        Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
    return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
                                   TypeInfo.inferScalarType(Op));
  }

  // reduce.add(ext(mul(ext(A), ext(B))))
  // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
  // TODO: Support this optimization for float types.
                    m_ZExtOrSExt(m_VPValue()))))) {
    auto *Ext = cast<VPWidenCastRecipe>(Op);
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
    auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    if (!Mul->hasOneUse() ||
        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
        MulLHS->getOpcode() != MulRHS->getOpcode())
      return Op;
    VPBuilder Builder(Mul);
    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
                                               MulLHS->getOperand(0),
                                               Ext->getResultType()));
    // Reuse the widened LHS when both mul operands are the same recipe.
    Mul->setOperand(1, MulLHS == MulRHS
                           ? Mul->getOperand(0)
                           : Builder.createWidenCast(MulRHS->getOpcode(),
                                                     MulRHS->getOperand(0),
                                                     Ext->getResultType()));
    return Mul;
  }

  return Op;
}
6105
/// Bundle the extend/multiply recipes feeding reduction \p Red into a single
/// VPExpressionRecipe. \p Red's vector operand must match one of the supported
/// forms below; any other shape is a caller error (llvm_unreachable).
static VPExpressionRecipe *
createPartialReductionExpression(VPReductionRecipe *Red) {
  VPValue *VecOp = Red->getVecOp();

  // reduce.[f]add(ext(op))
  // -> VPExpressionRecipe(op, red)
  if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);

  // reduce.[f]add([f]mul(ext(a), ext(b)))
  // -> VPExpressionRecipe(a, b, mul, red)
  if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
      match(VecOp,
    auto *Mul = cast<VPWidenRecipe>(VecOp);
    auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
  }

  // reduce.add(neg(mul(ext(a), ext(b))))
  // -> VPExpressionRecipe(a, b, mul, sub, red)
          m_ZExtOrSExt(m_VPValue()))))) {
    // The negation is modeled as sub(0, mul), so the multiply is operand 1.
    auto *Sub = cast<VPWidenRecipe>(VecOp);
    auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
    auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
  }

  llvm_unreachable("Unsupported expression");
}
6139
// Helper to transform a partial reduction chain into a partial reduction
// recipe. Assumes profitability has been checked.
static void transformToPartialReduction(const VPPartialReductionChain &Chain,
                                        VPTypeAnalysis &TypeInfo, VPlan &Plan,
                                        VPReductionPHIRecipe *RdxPhi) {
  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");

  // The accumulator operand continues the reduction chain; the other operand
  // is the (possibly extended/negated) value being accumulated.
  VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
  auto *ExtendedOp = cast<VPSingleDefRecipe>(
      WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));

  // Sub-reductions can be implemented in two ways:
  // (1) negate the operand in the vector loop (the default way).
  // (2) subtract the reduced value from the init value in the middle block.
  // Both ways keep the reduction itself as an 'add' reduction.
  //
  // The ISD nodes for partial reductions don't support folding the
  // sub/negation into its operands because the following is not a valid
  // transformation:
  // sub(0, mul(ext(a), ext(b)))
  // -> mul(ext(a), ext(sub(0, b)))
  //
  // It's therefore better to choose option (2) such that the partial
  // reduction is always positive (starting at '0') and to do a final
  // subtract in the middle block.
  if (WidenRecipe->getOpcode() == Instruction::Sub &&
      Chain.RK != RecurKind::Sub) {
    VPBuilder Builder(WidenRecipe);
    Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
    auto *Zero = Plan.getZero(ElemTy);
    // Negate the extended operand in the loop: 0 - ExtendedOp.
    auto *NegRecipe =
        new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
    // NOTE(review): a line is missing here in this copy (the remaining
    // constructor argument(s), original line 6173) — restore from upstream.
    Builder.insert(NegRecipe);
    ExtendedOp = NegRecipe;
  }

  // FIXME: Do these transforms before invoking the cost-model.
  ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);

  // Check if WidenRecipe is the final result of the reduction. If so look
  // through selects for predicated reductions.
  VPValue *Cond = nullptr;
  // NOTE(review): a line is missing here in this copy (presumably the
  // declaration of ExitValue and the opening of a match() call, original
  // line 6184) — restore from upstream.
      WidenRecipe,
      m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
                       RdxPhi->getBackedgeValue() == ExitValue;
  assert((!ExitValue || IsLastInChain) &&
         "if we found ExitValue, it must match RdxPhi's backedge value");

  Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
  RecurKind RdxKind =
  // NOTE(review): the initializer of RdxKind is missing in this copy —
  // restore from upstream.
  auto *PartialRed = new VPReductionRecipe(
      RdxKind,
      // Fast-math flags are only carried over for FP-add reductions.
      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
                                 : FastMathFlags(),
      WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
  PartialRed->insertBefore(WidenRecipe);

  if (Cond)
    ExitValue->replaceAllUsesWith(PartialRed);
  WidenRecipe->replaceAllUsesWith(PartialRed);

  // For cost-model purposes, fold this into a VPExpression.
  VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
  E->insertBefore(WidenRecipe);
  PartialRed->replaceAllUsesWith(E);

  // We only need to update the PHI node once, which is when we find the
  // last reduction in the chain.
  if (!IsLastInChain)
    return;

  // Scale the PHI and ReductionStartVector by the VFScaleFactor
  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);

  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
  StartInst->setOperand(2, NewScaleFactor);

  // If this is the last value in a sub-reduction chain, then update the PHI
  // node to start at `0` and update the reduction-result to subtract from
  // the PHI's start value.
  if (Chain.RK != RecurKind::Sub)
    return;

  VPValue *OldStartValue = StartInst->getOperand(0);
  StartInst->setOperand(0, StartInst->getOperand(1));

  // Replace reduction_result by 'sub (startval, reductionresult)'.
  // NOTE(review): the declaration of RdxResult is missing in this copy
  // (original line 6236) — restore from upstream.
  assert(RdxResult && "Could not find reduction result");

  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
  constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
  VPInstruction *NewResult = Builder.createNaryOp(
      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
      RdxPhi->getDebugLoc());
  // Rewire all users of the old reduction result to the subtracted result,
  // except the new subtract itself (which must keep consuming the old one).
  RdxResult->replaceUsesWithIf(
      NewResult,
      [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
}
6248
6249/// Returns the cost of a link in a partial-reduction chain for a given VF.
6250static InstructionCost
6251getPartialReductionLinkCost(VPCostContext &CostCtx,
6252 const VPPartialReductionChain &Link,
6253 ElementCount VF) {
6254 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6255 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6256 std::optional<unsigned> BinOpc = std::nullopt;
6257 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6258 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6259 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6260
6261 std::optional<llvm::FastMathFlags> Flags;
6262 if (RdxType->isFloatingPointTy())
6263 Flags = Link.ReductionBinOp->getFastMathFlags();
6264
6265 unsigned Opcode = Link.RK == RecurKind::Sub
6266 ? (unsigned)Instruction::Add
6267 : Link.ReductionBinOp->getOpcode();
6268 return CostCtx.TTI.getPartialReductionCost(
6269 Opcode, ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType, RdxType,
6270 VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6271 CostCtx.CostKind, Flags);
6272}
6273
/// Presumably maps a widen-cast recipe's opcode (sext/zext/fpext) to the
/// corresponding TTI partial-reduction extend kind — confirm against upstream.
static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
  // NOTE(review): the body (original line 6275, likely a single return
  // statement) is missing in this copy — restore from upstream.
}
6277
/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
/// operand. This is an operand where the source of the value (e.g. a load) has
/// been extended (sext, zext, or fpext) before it is used in the reduction.
///
/// Possible forms matched by this function:
/// - UpdateR(PrevValue, ext(...))
/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
/// - UpdateR(PrevValue, mul(ext(...), Constant))
/// - UpdateR(PrevValue, neg(mul(ext(...), ext(...))))
/// - UpdateR(PrevValue, neg(mul(ext(...), Constant)))
/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
///
/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
static std::optional<ExtendedReductionOperand>
matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
                              VPTypeAnalysis &TypeInfo) {
  assert(is_contained(UpdateR->operands(), Op) &&
         "Op should be operand of UpdateR");

  // Try matching an absolute difference operand of the form
  // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
  // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
  // difference on a wider type and get the extend for "free" from the partial
  // reduction.
  VPValue *X, *Y;
  if (Op->hasOneUse() &&
      // NOTE(review): the remainder of this condition (original lines
      // 6306-6308, presumably matching abs(sub(ext(X), ext(Y))) and binding
      // X/Y) is missing in this copy — restore from upstream.
    auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
    auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
    auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
    auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
    Type *LHSInputType = TypeInfo.inferScalarType(X);
    Type *RHSInputType = TypeInfo.inferScalarType(Y);
    // For an absolute difference, both extends must agree on source type and
    // extend kind.
    if (LHSInputType != RHSInputType ||
        LHSExt->getOpcode() != RHSExt->getOpcode())
      return std::nullopt;
    // Note: This is essentially the same as matching ext(...) as we will
    // rewrite this operand to ext(absolute-difference(A, B)).
    return ExtendedReductionOperand{
        Sub,
        /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
        /*ExtendB=*/{}};
  }

  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
  // NOTE(review): the guarding condition for this block (original line 6327,
  // presumably matching Op as a widen-cast recipe) is missing in this copy —
  // restore from upstream.
    auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
    VPValue *CastSource = CastRecipe->getOperand(0);
    OuterExtKind = getPartialReductionExtendKind(CastRecipe);
    if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
        match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
      // Match: ext(mul(...))
      // Record the outer extend kind and set `Op` to the mul. We can then match
      // this as a binary operation. Note: We can optimize out the outer extend
      // by widening the inner extends to match it. See
      // optimizeExtendsForPartialReduction.
      Op = CastSource;
    } else if (UpdateR->getOpcode() == Instruction::Add ||
               UpdateR->getOpcode() == Instruction::FAdd) {
      // Match: UpdateR(PrevValue, ext(...))
      // TODO: Remove the add/fadd restriction (we should be able to handle this
      // case for sub reductions too).
      return ExtendedReductionOperand{
          UpdateR,
          /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
          /*ExtendB=*/{}};
    }
  }

  // An operand with multiple users cannot be folded into the reduction.
  if (!Op->hasOneUse())
    return std::nullopt;

  // NOTE(review): the declaration of MulOp (original line 6354, presumably a
  // dyn_cast of Op to VPWidenRecipe) is missing in this copy — restore from
  // upstream.
  if (!MulOp ||
      !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
    return std::nullopt;

  // The rest of the matching assumes `Op` is a (possibly extended/negated)
  // binary operation.

  VPValue *LHS = MulOp->getOperand(0);
  VPValue *RHS = MulOp->getOperand(1);

  // The LHS of the operation must always be an extend.
  // NOTE(review): the condition checking that LHS is an extend (original
  // line 6366) is missing in this copy — restore from upstream.
    return std::nullopt;

  auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
  Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
  ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);

  // The RHS of the operation can be an extend or a constant integer.
  const APInt *RHSConst = nullptr;
  VPWidenCastRecipe *RHSCast = nullptr;
  // NOTE(review): the condition matching RHS as an extend (original line
  // 6376) is missing in this copy — restore from upstream.
    RHSCast = cast<VPWidenCastRecipe>(RHS);
  else if (!match(RHS, m_APInt(RHSConst)) ||
           !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
    return std::nullopt;

  // The outer extend kind must match the inner extends for folding.
  for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
    if (Cast && OuterExtKind &&
        getPartialReductionExtendKind(Cast) != OuterExtKind)
      return std::nullopt;

  // For a constant RHS, reuse the LHS extend info; otherwise take it from
  // the RHS cast itself.
  Type *RHSInputType = LHSInputType;
  ExtendKind RHSExtendKind = LHSExtendKind;
  if (RHSCast) {
    RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
    RHSExtendKind = getPartialReductionExtendKind(RHSCast);
  }

  return ExtendedReductionOperand{
      MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
}
6398
/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
/// and determines if the target can use a cheaper operation with a wider
/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
/// of operations in the reduction.
static std::optional<SmallVector<VPPartialReductionChain>>
getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
                    VFRange &Range) {
  // Get the backedge value from the reduction PHI and find the
  // ComputeReductionResult that uses it (directly or through a select for
  // predicated reductions).
  auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
  if (!RdxResult)
    return std::nullopt;
  VPValue *ExitValue = RdxResult->getOperand(0);
  // If the exit value is wrapped in a select (predicated reduction), look
  // through it; on a failed match ExitValue is left unchanged.
  match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));

  VPTypeAnalysis &TypeInfo = CostCtx.Types;
  // NOTE(review): the declaration of the result vector `Chain` (original
  // line 6416) is missing in this copy — restore from upstream.
  RecurKind RK = RedPhiR->getRecurrenceKind();
  Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
  TypeSize PHISize = PhiType->getPrimitiveSizeInBits();

  // Work backwards from the ExitValue examining each reduction operation.
  VPValue *CurrentValue = ExitValue;
  while (CurrentValue != RedPhiR) {
    // Every link in the chain must be a widened binary operation.
    auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
    if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
      return std::nullopt;

    VPValue *Op = UpdateR->getOperand(1);
    VPValue *PrevValue = UpdateR->getOperand(0);

    // Find the extended operand. The other operand (PrevValue) is the next link
    // in the reduction chain.
    std::optional<ExtendedReductionOperand> ExtendedOp =
        matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
    if (!ExtendedOp) {
      // Retry with the operands swapped.
      ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
      if (!ExtendedOp)
        return std::nullopt;
      std::swap(Op, PrevValue);
    }

    // The PHI width must be a known integer multiple of the extend's source
    // width; that multiple becomes the chain's scale factor below.
    Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
    TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
    if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
      return std::nullopt;

    // Check if a partial reduction chain is supported by the target (i.e. does
    // not have an invalid cost) for the given VF range. Clamps the range and
    // returns true if feasible for any VF.
    VPPartialReductionChain Link(
        {UpdateR, *ExtendedOp, RK,
         PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
         static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
    Chain.push_back(Link);
    CurrentValue = PrevValue;
  }

  // The chain links were collected by traversing backwards from the exit value.
  // Reverse the chains so they are in program order.
  std::reverse(Chain.begin(), Chain.end());
  return Chain;
}
6463} // namespace
6464
// NOTE(review): the first line of this function's signature (its name and
// first parameter, original line 6465) was lost in extraction — restore
// from upstream. This is the entry point that finds, validates, and applies
// partial-reduction chains across the plan's reduction PHIs.
                                         VPCostContext &CostCtx,
                                         VFRange &Range) {
  // Find all possible valid partial reductions, grouping chains by their PHI.
  // This grouping allows invalidating the whole chain, if any link is not a
  // valid partial reduction.
  // NOTE(review): the type of the ChainsByPhi declaration (original line
  // 6471) is missing in this copy — restore from upstream.
      ChainsByPhi;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &R : HeaderVPBB->phis()) {
    // Only reduction PHIs can anchor a partial-reduction chain.
    auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!RedPhiR)
      continue;

    if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
      ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
  }

  if (ChainsByPhi.empty())
    return;

  // Build set of partial reduction operations for extend user validation and
  // a map of reduction bin ops to their scale factors for scale validation.
  SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
  DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
  for (const auto &[_, Chains] : ChainsByPhi)
    for (const VPPartialReductionChain &Chain : Chains) {
      PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
      ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
    }

  // A partial reduction is invalid if any of its extends are used by
  // something that isn't another partial reduction. This is because the
  // extends are intended to be lowered along with the reduction itself.
  auto ExtendUsersValid = [&](VPValue *Ext) {
    return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
      return PartialReductionOps.contains(cast<VPRecipeBase>(U));
    });
  };

  auto IsProfitablePartialReductionChainForVF =
      [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
    InstructionCost PartialCost = 0, RegularCost = 0;

    // The chain is a profitable partial reduction chain if the cost of handling
    // the entire chain is cheaper when using partial reductions than when
    // handling the entire chain using regular reductions.
    for (const VPPartialReductionChain &Link : Chain) {
      const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
      InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
      if (!LinkCost.isValid())
        return false;

      PartialCost += LinkCost;
      RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
      // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
      if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
        RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
      for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
        if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
          RegularCost += Extend->computeCost(VF, CostCtx);
    }
    return PartialCost.isValid() && PartialCost < RegularCost;
  };

  // Validate chains: check that extends are only used by partial reductions,
  // and that reduction bin ops are only used by other partial reductions with
  // matching scale factors, are outside the loop region or the select
  // introduced by tail-folding. Otherwise we would create users of scaled
  // reductions where the types of the other operands don't match.
  for (auto &[RedPhiR, Chains] : ChainsByPhi) {
    for (const VPPartialReductionChain &Chain : Chains) {
      if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
        Chains.clear();
        break;
      }
      auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
        if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
          return PhiR == RedPhiR;
        auto *R = cast<VPSingleDefRecipe>(U);
        return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
               // NOTE(review): part of this condition (a match() call,
               // original line 6546) is missing in this copy — restore from
               // upstream.
                          m_Specific(Chain.ReductionBinOp))) ||
               match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
                                 m_Specific(RedPhiR)));
      };
      if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
        Chains.clear();
        break;
      }

      // Check if the compute-reduction-result is used by a sunk store.
      // TODO: Also form partial reductions in those cases.
      if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
        if (any_of(RdxResult->users(), [](VPUser *U) {
              auto *RepR = dyn_cast<VPReplicateRecipe>(U);
              return RepR && RepR->getOpcode() == Instruction::Store;
            })) {
          Chains.clear();
          break;
        }
      }
    }

    // Clear the chain if it is not profitable.
    // NOTE(review): the start of this call (original line 6570, presumably a
    // decision-and-clamp-range helper) is missing in this copy — restore
    // from upstream.
        [&, &Chains = Chains](ElementCount VF) {
          return IsProfitablePartialReductionChainForVF(Chains, VF);
        },
        Range))
      Chains.clear();
  }

  // Apply the transformation for every chain that survived validation and
  // profitability checks.
  for (auto &[Phi, Chains] : ChainsByPhi)
    for (const VPPartialReductionChain &Chain : Chains)
      transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
}
6582
// NOTE(review): the first line of this function's signature (its name,
// original line 6583) was lost in extraction — restore from upstream. This
// routine replaces load/store VPInstructions with widened, histogram, or
// replicated memory recipes.
    VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
  // Collect all loads/stores first. We will start with ones having simpler
  // decisions followed by more complex ones that are potentially
  // guided/dependent on the simpler ones.
  // NOTE(review): the declaration of MemOps (original line 6588) and the
  // range expression of the outer loop (original lines 6590-6591) are
  // missing in this copy — restore from upstream.
  for (VPBasicBlock *VPBB :
      for (VPRecipeBase &R : *VPBB) {
        // Only VPInstructions wrapping an underlying IR load/store are
        // collected.
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI && VPI->getUnderlyingValue() &&
            is_contained({Instruction::Load, Instruction::Store},
                         VPI->getOpcode()))
          MemOps.push_back(VPI);
      }
  }

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  for (VPInstruction *VPI : MemOps) {
    // Replace VPI with a newly created recipe: record the mapping in the
    // recipe builder, splice the new recipe in place, and for loads also
    // redirect users of the loaded value.
    auto ReplaceWith = [&](VPRecipeBase *New) {
      RecipeBuilder.setRecipe(cast<Instruction>(VPI->getUnderlyingValue()),
                              New);
      New->insertBefore(VPI);
      if (VPI->getOpcode() == Instruction::Load)
        VPI->replaceAllUsesWith(New->getVPSingleValue());
      VPI->eraseFromParent();
    };

    // Note: we must do that for scalar VPlan as well.
    if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
                                                       FinalRedStoresBuilder))
      continue;

    // Filter out scalar VPlan for the remaining memory operations.
    // NOTE(review): the start of this call (original line 6620, presumably a
    // decision-and-clamp-range helper) is missing in this copy — restore
    // from upstream.
        [](ElementCount VF) { return VF.isScalar(); }, Range))
      continue;

    if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
      ReplaceWith(Histogram);
      continue;
    }

    // Prefer a widened memory recipe; fall back to replication otherwise.
    VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
    if (!Recipe)
      Recipe = RecipeBuilder.handleReplication(VPI, Range);

    ReplaceWith(Recipe);
  }
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1054
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1027
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1685
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3793
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4160
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4235
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4187
iterator end()
Definition VPlan.h:4197
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4195
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4248
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4209
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2780
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2816
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2806
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2822
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2802
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:303
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:204
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:222
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:240
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:276
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:260
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3277
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1670
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3825
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:498
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:471
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:483
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:493
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3909
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3322
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2300
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2342
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2331
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2045
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4313
Class to record and manage LLVM IR flags.
Definition VPlan.h:688
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1168
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1223
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1324
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1267
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1318
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1262
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1259
@ CanonicalIVIncrementForPart
Definition VPlan.h:1243
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1270
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2918
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2910
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2939
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2991
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2949
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3464
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4505
VPBasicBlock * getParent()
Definition VPlan.h:480
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
void setRecipe(Instruction *I, VPRecipeBase *R)
Set the recipe created for given ingredient.
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3151
A recipe for handling reduction phis.
Definition VPlan.h:2686
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2733
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2726
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2744
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3042
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4370
const VPBlockBase * getEntry() const
Definition VPlan.h:4414
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4446
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:880
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4431
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4490
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4498
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4482
const VPBlockBase * getExiting() const
Definition VPlan.h:4426
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4439
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3196
bool isSingleScalar() const
Definition VPlan.h:3237
bool isPredicated() const
Definition VPlan.h:3239
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3261
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3980
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:606
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:673
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:329
operand_range operands()
Definition VPlanValue.h:397
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:373
unsigned getNumOperands() const
Definition VPlanValue.h:367
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:368
void addOperand(VPValue *Operand)
Definition VPlanValue.h:362
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1495
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1498
unsigned getNumUsers() const
Definition VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1504
user_range users()
Definition VPlanValue.h:155
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2151
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3868
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1837
Instruction::CastOps getOpcode() const
Definition VPlan.h:1875
A recipe for handling GEP instructions.
Definition VPlan.h:2087
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2366
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2394
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2397
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2417
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2448
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2495
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2499
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2526
A recipe for widening vector intrinsics.
Definition VPlan.h:1889
A common base class for widening memory operations.
Definition VPlan.h:3507
A recipe for widened phis.
Definition VPlan.h:2584
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1781
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1801
unsigned getOpcode() const
Definition VPlan.h:1818
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4518
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4834
bool hasVF(ElementCount VF) const
Definition VPlan.h:4732
const DataLayout & getDataLayout() const
Definition VPlan.h:4714
LLVMContext & getContext() const
Definition VPlan.h:4710
VPBasicBlock * getEntry()
Definition VPlan.h:4610
bool hasScalableVF() const
Definition VPlan.h:4733
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4669
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4690
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4739
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4805
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4708
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4811
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4883
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4837
bool hasUF(unsigned UF) const
Definition VPlan.h:4757
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4659
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4698
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4695
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4782
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4808
void setVF(ElementCount VF)
Definition VPlan.h:4720
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4773
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1095
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4760
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4683
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4635
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4860
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4802
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4705
bool hasScalarVFOnly() const
Definition VPlan.h:4750
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4649
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4615
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4701
void setUF(unsigned UF)
Definition VPlan.h:4765
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:4915
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1243
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4816
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2814
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:265
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1879
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2668
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2624
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:240
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:288
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3626
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3586
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3710
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3667
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...