LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
// Converts recipes that wrap underlying IR values in the vector loop region
// into specialized widened recipes (phi, load/store, GEP, call intrinsic,
// cast, or generic widen). Returns false when widening is not possible
// (call with no vector intrinsic mapping, or noalias.scope.decl).
// NOTE(review): the doxygen extraction dropped the signature line (orig. 50)
// and several interior lines (53, 55, 69-70, 88, 136) — verify the exact
// control flow against upstream VPlanTransforms.cpp.
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52 
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
// Only process recipes up to (but excluding) the block terminator.
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64 
65 VPValue *VPV = Ingredient.getVPSingleValue();
// Only recipes with an attached underlying IR value are converted.
66 if (!VPV->getUnderlyingValue())
67 continue;
68 
// NOTE(review): missing line(s) here presumably define `Inst`, the
// underlying Instruction used by the dispatch below — confirm upstream.
70 
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
// NOTE(review): the `else if` head binding `GEP` (orig. line 88) is
// missing from this extraction.
89 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
90 Ingredient.getDebugLoc());
91 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
92 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
93 if (VectorID == Intrinsic::not_intrinsic)
94 return false;
95 
96 // The noalias.scope.decl intrinsic declares a noalias scope that
97 // is valid for a single iteration. Emitting it as a single-scalar
98 // replicate would incorrectly extend the scope across multiple
99 // original iterations packed into one vector iteration.
100 // FIXME: If we want to vectorize this loop, then we have to drop
101 // all the associated !alias.scope and !noalias.
102 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
103 return false;
104 
105 // These intrinsics are recognized by getVectorIntrinsicIDForCall
106 // but are not widenable. Emit them as replicate instead of widening.
107 if (VectorID == Intrinsic::assume ||
108 VectorID == Intrinsic::lifetime_end ||
109 VectorID == Intrinsic::lifetime_start ||
110 VectorID == Intrinsic::sideeffect ||
111 VectorID == Intrinsic::pseudoprobe) {
112 // If the operand of llvm.assume holds before vectorization, it will
113 // also hold per lane.
114 // llvm.pseudoprobe requires to be duplicated per lane for accurate
115 // sample count.
116 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
117 VectorID != Intrinsic::pseudoprobe;
118 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
119 /*IsSingleScalar=*/IsSingleScalar,
120 /*Mask=*/nullptr, *VPI, *VPI,
121 Ingredient.getDebugLoc());
122 } else {
// drop_end: the last operand of the call recipe is the called function,
// not an intrinsic argument.
123 NewRecipe = new VPWidenIntrinsicRecipe(
124 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
125 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
126 }
127 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
128 NewRecipe = new VPWidenCastRecipe(
129 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
130 VPIRFlags(*CI), VPIRMetadata(*CI));
131 } else {
132 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
133 *VPI, Ingredient.getDebugLoc());
134 }
135 } else {
// NOTE(review): the assert head for this message (orig. line 136) is
// missing from this extraction.
137 "inductions must be created earlier");
138 continue;
139 }
140 
// Splice the new recipe in place of the old ingredient and rewire users.
141 NewRecipe->insertBefore(&Ingredient);
142 if (NewRecipe->getNumDefinedValues() == 1)
143 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
144 else
145 assert(NewRecipe->getNumDefinedValues() == 0 &&
146 "Only recpies with zero or one defined values expected");
147 Ingredient.eraseFromParent();
148 }
149 }
150 return true;
151}
152
153/// Helper for extra no-alias checks via known-safe recipe and SCEV.
// NOTE(review): the class declaration line (orig. 154) and the PSE member
// declaration (orig. 157) are missing from this extraction; members below
// are private (the `public:` section starts at orig. 199).
155 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
156 VPReplicateRecipe &GroupLeader;
158 const Loop &L;
159 VPTypeAnalysis &TypeInfo;
160 
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled; operand 1 of a replicated store is
// its address (see the address extraction below).
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167 
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): the bail-out condition guarding this `return false`
// (orig. line 172, presumably a SCEVCouldNotCompute check) is missing.
173 return false;
174 
// The accesses provably don't alias if their constant address distance
// is at least MaxVF * max store size (checked below).
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179 
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185 
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190 
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): the computation of `MaxVF` from `VFs` (orig. line 192)
// is missing from this extraction.
193 if (MaxVF.isScalable())
194 return false;
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198 
199public:
// NOTE(review): the constructor's parameter list head (orig. 200-201) is
// missing from this extraction.
202 const Loop &L, VPTypeAnalysis &TypeInfo)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L), TypeInfo(TypeInfo) {}
205 
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that both
220/// read and write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
222static bool
// NOTE(review): the line carrying the function name and the first
// parameter (orig. 223, a MemoryLocation `MemLoc` used below) is missing
// from this extraction.
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
226 bool CheckReads = SinkInfo.has_value();
// Without noalias scope metadata there is nothing to prove against;
// conservatively report possible aliasing.
227 if (!MemLoc.AATags.Scope)
228 return false;
229 
230 for (VPBasicBlock *VPBB :
// NOTE(review): the block-range expression over FirstBB..LastBB
// (orig. line 231) is missing from this extraction.
232 for (VPRecipeBase &R : *VPBB) {
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235 
236 // Skip recipes that don't need checking.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239 
// NOTE(review): the definition of `Loc` (orig. line 240) is missing.
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245 
// NOTE(review): the metadata-based alias condition guarding this
// `return false` (orig. line 246) is missing from this extraction.
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Collect either replicated Loads or Stores grouped by their address SCEV, in
254/// a deep-traversal of the vector loop region in \p Plan.
255template <unsigned Opcode>
// NOTE(review): the function name / return-type lines (orig. 256-257) are
// missing from this extraction; the function returns the `Groups` vector
// built below.
258 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
259 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
260 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
261 "Only Load and Store opcodes supported");
262 constexpr bool IsLoad = (Opcode == Instruction::Load);
// NOTE(review): the map type of `RecipesByAddress` (orig. line 263) and
// the traversal loop head (orig. 265-266) are missing from this
// extraction.
264 RecipesByAddress;
267 for (VPRecipeBase &R : *VPBB) {
// Only replicate recipes of the requested opcode that pass the caller's
// filter are grouped.
268 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
269 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
270 continue;
271 
272 // For loads, operand 0 is address; for stores, operand 1 is address.
273 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
274 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
275 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
276 RecipesByAddress[AddrSCEV].push_back(RepR);
277 }
278 }
279 auto Groups = to_vector(RecipesByAddress.values());
280 VPDominatorTree VPDT(Plan);
281 for (auto &Group : Groups) {
282 // Sort mem ops by dominance order, with earliest (most dominating) first.
// NOTE(review): the sort call head with its lambda parameters
// (orig. line 283) is missing from this extraction.
284 return VPDT.properlyDominates(A, B);
285 });
286 }
287 return Groups;
288}
289
290/// Return true if we do not know how to (mechanically) hoist or sink \p R out
291/// of a loop region. When sinking, passing \p Sinking = true ensures that
292/// assumes aren't sunk.
// NOTE(review): the signature line (orig. 293) is missing from this
// extraction; call sites below name this `cannotHoistOrSinkRecipe`.
294 bool Sinking = false) {
295 // Assumes don't alias anything or throw; as long as they're guaranteed to
296 // execute, they're safe to hoist. They should however not be sunk, as it
297 // would destroy information.
// NOTE(review): the condition matching assume calls that guards this
// return (orig. line 298) is missing from this extraction.
299 return Sinking;
300 
301 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
302 // memory location is not modified in the vector loop.
303 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
304 return true;
305 
306 // Allocas cannot be hoisted.
307 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
308 return RepR && RepR->getOpcode() == Instruction::Alloca;
309}
310
// Sinks scalar operand-defining recipes into the replicate blocks that use
// them, duplicating a recipe when users outside the target block only need
// its first lane. Returns true if any recipe was moved.
311static bool sinkScalarOperands(VPlan &Plan) {
312 auto Iter = vp_depth_first_deep(Plan.getEntry());
313 bool ScalarVFOnly = Plan.hasScalarVFOnly();
314 bool Changed = false;
315 
// NOTE(review): the declaration of `WorkList` (orig. line 316, a set-like
// container of {SinkTo, candidate} pairs, indexable and insertable below)
// is missing from this extraction.
317 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
318 VPBasicBlock *SinkTo, VPValue *Op) {
319 auto *Candidate =
320 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
321 if (!Candidate)
322 return;
323 
324 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
325 // for now.
// NOTE(review): the type check implementing the comment above
// (orig. line 326) is missing from this extraction.
327 return;
328 
// Nothing to do if the candidate is already in the target block, or it
// is a recipe we must not move (side effects, reads, phis, allocas,
// assumes when sinking).
329 if (Candidate->getParent() == SinkTo ||
330 cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
331 return;
332 
// Single-scalar replicates stay put unless only scalar VFs are possible.
333 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
334 if (!ScalarVFOnly && RepR->isSingleScalar())
335 return;
336 
337 WorkList.insert({SinkTo, Candidate});
338 };
339 
340 // First, collect the operands of all recipes in replicate blocks as seeds for
341 // sinking.
// NOTE(review): the region-traversal loop head binding `VPR`
// (orig. line 342) is missing from this extraction.
343 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
344 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
345 continue;
346 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
347 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
348 continue;
349 for (auto &Recipe : *VPBB)
350 for (VPValue *Op : Recipe.operands())
351 InsertIfValidSinkCandidate(VPBB, Op);
352 }
353 
354 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// The worklist grows as operands of sunk recipes are added; index-based
// iteration keeps this safe.
355 for (unsigned I = 0; I != WorkList.size(); ++I) {
356 VPBasicBlock *SinkTo;
357 VPSingleDefRecipe *SinkCandidate;
358 std::tie(SinkTo, SinkCandidate) = WorkList[I];
359 
360 // All recipe users of SinkCandidate must be in the same block SinkTo or all
361 // users outside of SinkTo must only use the first lane of SinkCandidate. In
362 // the latter case, we need to duplicate SinkCandidate.
363 auto UsersOutsideSinkTo =
364 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
365 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
366 });
367 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
368 return !U->usesFirstLaneOnly(SinkCandidate);
369 }))
370 continue;
371 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
372 
373 if (NeedsDuplicating) {
374 if (ScalarVFOnly)
375 continue;
376 VPSingleDefRecipe *Clone;
377 if (auto *SinkCandidateRepR =
378 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
379 // TODO: Handle converting to uniform recipes as separate transform,
380 // then cloning should be sufficient here.
381 Instruction *I = SinkCandidate->getUnderlyingInstr();
382 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
383 nullptr /*Mask*/, *SinkCandidateRepR,
384 *SinkCandidateRepR);
385 // TODO: add ".cloned" suffix to name of Clone's VPValue.
386 } else {
387 Clone = SinkCandidate->clone();
388 }
389 
// The clone stays in the original block and serves the outside users;
// the original moves into SinkTo below.
390 Clone->insertBefore(SinkCandidate);
391 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
392 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
393 });
394 }
395 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
// Operands of the sunk recipe become new sink candidates.
396 for (VPValue *Op : SinkCandidate->operands())
397 InsertIfValidSinkCandidate(SinkTo, Op);
398 Changed = true;
399 }
400 return Changed;
401}
402
403/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
404/// the mask.
// NOTE(review): the signature line (orig. 405) is missing from this
// extraction; call sites below name this `getPredicatedMask(VPRegionBlock *)`.
406 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
// The entry must contain exactly one recipe, a VPBranchOnMaskRecipe;
// its sole operand is the mask.
407 if (!EntryBB || EntryBB->size() != 1 ||
408 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
409 return nullptr;
410 
411 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
412}
413
414/// If \p R is a triangle region, return the 'then' block of the triangle.
// NOTE(review): the signature line (orig. 415) is missing from this
// extraction; call sites below name this
// `getPredicatedThenBlock(VPRegionBlock *)`.
416 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
417 if (EntryBB->getNumSuccessors() != 2)
418 return nullptr;
419 
420 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
421 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
422 if (!Succ0 || !Succ1)
423 return nullptr;
424 
// A triangle: one successor falls through to the other, so exactly one of
// the two successors has a (single) successor of its own.
425 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
426 return nullptr;
427 if (Succ0->getSingleSuccessor() == Succ1)
428 return Succ0;
429 if (Succ1->getSingleSuccessor() == Succ0)
430 return Succ1;
431 return nullptr;
432}
433
434// Merge replicate regions in their successor region, if a replicate region
435// is connected to a successor replicate region with the same predicate by a
436// single, empty VPBasicBlock.
// NOTE(review): the signature line (orig. 437) is missing from this
// extraction; the call site in the transform driver names this
// `mergeReplicateRegionsIntoSuccessors(VPlan &)`, returning bool.
438 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
439 
440 // Collect replicate regions followed by an empty block, followed by another
441 // replicate region with matching masks to process front. This is to avoid
442 // iterator invalidation issues while merging regions.
// NOTE(review): the worklist declaration and the traversal loop head
// binding `Region1` (orig. 443-444) are missing from this extraction.
445 vp_depth_first_deep(Plan.getEntry()))) {
446 if (!Region1->isReplicator())
447 continue;
448 auto *MiddleBasicBlock =
449 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
450 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
451 continue;
452 
453 auto *Region2 =
454 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
455 if (!Region2 || !Region2->isReplicator())
456 continue;
457 
// Both regions must be guarded by the identical mask value for the merge
// to preserve semantics.
458 VPValue *Mask1 = getPredicatedMask(Region1);
459 VPValue *Mask2 = getPredicatedMask(Region2);
460 if (!Mask1 || Mask1 != Mask2)
461 continue;
462 
463 assert(Mask1 && Mask2 && "both region must have conditions");
464 WorkList.push_back(Region1);
465 }
466 
467 // Move recipes from Region1 to its successor region, if both are triangles.
468 for (VPRegionBlock *Region1 : WorkList) {
469 if (TransformedRegions.contains(Region1))
470 continue;
471 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
472 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
473 
474 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
475 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
476 if (!Then1 || !Then2)
477 continue;
478 
479 // Note: No fusion-preventing memory dependencies are expected in either
480 // region. Such dependencies should be rejected during earlier dependence
481 // checks, which guarantee accesses can be re-ordered for vectorization.
482 //
483 // Move recipes to the successor region.
// Reverse iteration keeps the original relative order after repeated
// insertion at Then2's first non-phi position.
484 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
485 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
486 
487 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
488 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
489 
490 // Move VPPredInstPHIRecipes from the merge block to the successor region's
491 // merge block. Update all users inside the successor region to use the
492 // original values.
493 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
494 VPValue *PredInst1 =
495 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
496 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
497 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
498 return cast<VPRecipeBase>(&U)->getParent() == Then2;
499 });
500 
501 // Remove phi recipes that are unused after merging the regions.
502 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
503 Phi1ToMove.eraseFromParent();
504 continue;
505 }
506 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
507 }
508 
509 // Remove the dead recipes in Region1's entry block.
510 for (VPRecipeBase &R :
511 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
512 R.eraseFromParent();
513 
514 // Finally, remove the first region.
// Reconnect every predecessor of Region1 directly to the middle block,
// bypassing (and thereby unlinking) Region1.
515 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
516 VPBlockUtils::disconnectBlocks(Pred, Region1);
517 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
518 }
519 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
520 TransformedRegions.insert(Region1);
521 }
522 
523 return !TransformedRegions.empty();
524}
525
// Builds a triangular if-then replicate region around a predicated
// replicate recipe: entry (branch-on-mask) -> "if" block (unmasked
// replicate) -> "continue" block (optional VPPredInstPHIRecipe), and
// returns the new region.
// NOTE(review): the signature head naming this function and its
// VPReplicateRecipe parameter `PredRecipe` (orig. line 526) is missing
// from this extraction.
527 VPlan &Plan) {
528 Instruction *Instr = PredRecipe->getUnderlyingInstr();
529 // Build the triangular if-then region.
530 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
531 assert(Instr->getParent() && "Predicated instruction not in any basic block");
532 auto *BlockInMask = PredRecipe->getMask();
533 auto *MaskDef = BlockInMask->getDefiningRecipe();
534 auto *BOMRecipe = new VPBranchOnMaskRecipe(
535 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
536 auto *Entry =
537 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
538 
539 // Replace predicated replicate recipe with a replicate recipe without a
540 // mask but in the replicate region.
// drop_end: the mask is the last operand of the predicated recipe and is
// now modeled by the region's branch-on-mask instead.
541 auto *RecipeWithoutMask = new VPReplicateRecipe(
542 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
543 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
544 PredRecipe->getDebugLoc());
545 auto *Pred =
546 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
547 
// Only materialize a merge phi when the predicated value has users.
548 VPPredInstPHIRecipe *PHIRecipe = nullptr;
549 if (PredRecipe->getNumUsers() != 0) {
550 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
551 RecipeWithoutMask->getDebugLoc());
552 PredRecipe->replaceAllUsesWith(PHIRecipe);
553 PHIRecipe->setOperand(0, RecipeWithoutMask);
554 }
555 PredRecipe->eraseFromParent();
556 auto *Exiting =
557 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
// NOTE(review): the binding of the created region to `Region`
// (orig. line 558) is missing from this extraction.
559 Plan.createReplicateRegion(Entry, Exiting, RegionName);
560 
561 // Note: first set Entry as region entry and then connect successors starting
562 // from it in order, to propagate the "parent" of each VPBasicBlock.
563 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
564 VPBlockUtils::connectBlocks(Pred, Exiting);
565 
566 return Region;
567}
568
// Wraps every predicated VPReplicateRecipe in the plan into its own
// if-then replicate region by splitting its block at the recipe.
569static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): the worklist declaration and the block-traversal loop head
// (orig. 570-571) are missing from this extraction; recipes are collected
// first to avoid mutating blocks while iterating them.
572 vp_depth_first_deep(Plan.getEntry()))) {
573 for (VPRecipeBase &R : *VPBB)
574 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
575 if (RepR->isPredicated())
576 WorkList.push_back(RepR);
577 }
578 }
579 
580 unsigned BBNum = 0;
581 for (VPReplicateRecipe *RepR : WorkList) {
582 VPBasicBlock *CurrentBlock = RepR->getParent();
// Split at the recipe so the region can be inserted between the two halves.
583 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
584 
585 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
586 SplitBlock->setName(
587 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
588 // Record predicated instructions for above packing optimizations.
// NOTE(review): the creation of `Region` (orig. line 589, presumably via
// the replicate-region builder above) and orig. line 591 are missing from
// this extraction.
590 Region->setParent(CurrentBlock->getParent());
592 
// If the split block was the parent region's exiting block, the exit must
// now be the tail half of the split.
593 VPRegionBlock *ParentRegion = Region->getParent();
594 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
595 ParentRegion->setExiting(SplitBlock);
596 }
597}
598
// Folds basic blocks into their single predecessor when that predecessor
// has exactly one successor, returning true if any block was merged.
// NOTE(review): the function signature and worklist declaration
// (orig. 599-601) are missing from this extraction; the transform driver
// names this `mergeBlocksIntoPredecessors(VPlan &)`.
602 vp_depth_first_deep(Plan.getEntry()))) {
603 // Don't fold the blocks in the skeleton of the Plan into their single
604 // predecessors for now.
605 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
606 if (!VPBB->getParent())
607 continue;
608 auto *PredVPBB =
609 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
610 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
611 isa<VPIRBasicBlock>(PredVPBB))
612 continue;
613 WorkList.push_back(VPBB);
614 }
615 
616 for (VPBasicBlock *VPBB : WorkList) {
617 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Append all recipes to the predecessor, then splice VPBB out of the CFG.
618 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
619 R.moveBefore(*PredVPBB, PredVPBB->end());
620 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
621 auto *ParentRegion = VPBB->getParent();
622 if (ParentRegion && ParentRegion->getExiting() == VPBB)
623 ParentRegion->setExiting(PredVPBB);
624 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
625 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
626 }
627 return !WorkList.empty();
628}
629
// Driver: introduce replicate regions, then repeatedly simplify until a
// fixed point (each pass can expose opportunities for the others).
// NOTE(review): the enclosing function signature (orig. line 630) and the
// call on orig. line 632 are missing from this extraction — verify against
// upstream.
631 // Convert masked VPReplicateRecipes to if-then region blocks.
633 
634 bool ShouldSimplify = true;
635 while (ShouldSimplify) {
636 ShouldSimplify = sinkScalarOperands(Plan);
637 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
638 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
639 }
640}
641
642/// Remove redundant casts of inductions.
643///
644/// Such redundant casts are casts of induction variables that can be ignored,
645/// because we already proved that the casted phi is equal to the uncasted phi
646/// in the vectorized loop. There is no need to vectorize the cast - the same
647/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): the function signature (orig. line 648) is missing from
// this extraction.
649 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// NOTE(review): the definition of `IV` (orig. line 650, presumably a
// dyn_cast of &Phi to a widened induction recipe) is missing.
651 if (!IV || IV->getTruncInst())
652 continue;
653 
654 // A sequence of IR Casts has potentially been recorded for IV, which
655 // *must be bypassed* when the IV is vectorized, because the vectorized IV
656 // will produce the desired casted value. This sequence forms a def-use
657 // chain and is provided in reverse order, ending with the cast that uses
658 // the IV phi. Search for the recipe of the last cast in the chain and
659 // replace it with the original IV. Note that only the final cast is
660 // expected to have users outside the cast-chain and the dead casts left
661 // over will be cleaned up later.
662 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
663 VPValue *FindMyCast = IV;
// Walk the recorded cast chain from the phi outwards, locating the recipe
// that wraps each IR cast.
664 for (Instruction *IRCast : reverse(Casts)) {
665 VPSingleDefRecipe *FoundUserCast = nullptr;
666 for (auto *U : FindMyCast->users()) {
667 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
668 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
669 FoundUserCast = UserCast;
670 break;
671 }
672 }
673 FindMyCast = FoundUserCast;
674 }
// FindMyCast now refers to the final cast in the chain; its users can use
// the induction directly.
675 FindMyCast->replaceAllUsesWith(IV);
676 }
677}
678
679/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
680/// recipe, if it exists.
// NOTE(review): the function signature (orig. line 681) is missing from
// this extraction.
682 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
683 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
684 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
// Find a VPWidenCanonicalIVRecipe among the canonical IV's users.
685 for (VPUser *U : CanonicalIV->users()) {
// NOTE(review): the assignment of `WidenNewIV` from `U` (orig. line 686)
// is missing from this extraction.
687 if (WidenNewIV)
688 break;
689 }
690 
691 if (!WidenNewIV)
692 return;
693 
694 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
695 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
696 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
697 
698 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
699 continue;
700 
701 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
702 // everything WidenNewIV's users need. That is, WidenOriginalIV will
703 // generate a vector phi or all users of WidenNewIV demand the first lane
704 // only.
705 if (Plan.hasScalarVFOnly() ||
706 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
707 vputils::onlyFirstLaneUsed(WidenNewIV)) {
708 // We are replacing a wide canonical iv with a suitable wide induction.
709 // This is used to compute header mask, hence all lanes will be used and
710 // we need to drop wrap flags only applying to lanes guranteed to execute
711 // in the original scalar loop.
712 WidenOriginalIV->dropPoisonGeneratingFlags();
713 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
714 WidenNewIV->eraseFromParent();
715 return;
716 }
717 }
718}
719
720/// Returns true if \p R is dead and can be removed.
721static bool isDeadRecipe(VPRecipeBase &R) {
722 // Do remove conditional assume instructions as their conditions may be
723 // flattened.
724 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// NOTE(review): the tail of this condition (orig. line 726, presumably
// matching an assume call) is missing from this extraction.
725 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
727 if (IsConditionalAssume)
728 return true;
729 
// Recipes with side effects must be kept even if their results are unused.
730 if (R.mayHaveSideEffects())
731 return false;
732 
733 // Recipe is dead if no user keeps the recipe alive.
734 return all_of(R.definedValues(),
735 [](VPValue *V) { return V->getNumUsers() == 0; });
736}
737
// Removes dead recipes across the plan, plus dead VPPhi <-> update cycles
// (a phi whose only user is its own incoming update and vice versa).
// NOTE(review): the function signature and the traversal setup
// (orig. 738-739 and 741) are missing from this extraction; line 740 is
// the tail of a traversal constructor over Plan.getEntry().
740 Plan.getEntry());
742 // The recipes in the block are processed in reverse order, to catch chains
743 // of dead recipes.
744 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
745 if (isDeadRecipe(R)) {
746 R.eraseFromParent();
747 continue;
748 }
749 
750 // Check if R is a dead VPPhi <-> update cycle and remove it.
751 VPValue *Start, *Incoming;
752 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
753 continue;
754 auto *PhiR = cast<VPPhi>(&R);
// The cycle is dead only if the phi's single user is the incoming update
// and that update's only user is the phi.
755 VPUser *PhiUser = PhiR->getSingleUser();
756 if (!PhiUser)
757 continue;
758 if (PhiUser != Incoming->getDefiningRecipe() ||
759 Incoming->getNumUsers() != 1)
760 continue;
761 PhiR->replaceAllUsesWith(Start);
762 PhiR->eraseFromParent();
763 Incoming->getDefiningRecipe()->eraseFromParent();
764 }
765 }
766}
767
// Builds scalar IV steps derived from the canonical IV: creates a derived
// IV (offset.idx), truncates it and/or the step if needed, and returns a
// scalar-IV-steps recipe advancing by Step per lane.
// NOTE(review): the function name and leading parameters (orig. 768-769)
// are missing from this extraction.
770 Instruction::BinaryOps InductionOpcode,
771 FPMathOperator *FPBinOp, Instruction *TruncI,
772 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
773 VPBuilder &Builder) {
774 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
775 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
776 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
777 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
778 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
779 
780 // Truncate base induction if needed.
781 VPTypeAnalysis TypeInfo(Plan);
782 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
783 if (TruncI) {
784 Type *TruncTy = TruncI->getType();
785 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
786 "Not truncating.");
787 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
788 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
789 ResultTy = TruncTy;
790 }
791 
792 // Truncate step if needed.
793 Type *StepTy = TypeInfo.inferScalarType(Step);
794 if (ResultTy != StepTy) {
795 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
796 "Not truncating.");
797 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
// The step is loop-invariant, so its truncation is emitted in the vector
// preheader rather than inside the loop.
798 auto *VecPreheader =
// NOTE(review): the expression completing the `VecPreheader` definition
// (orig. line 799) is missing from this extraction.
800 VPBuilder::InsertPointGuard Guard(Builder);
801 Builder.setInsertPoint(VecPreheader);
802 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
803 }
804 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
805 &Plan.getVF(), DL);
806}
807
// Collects the transitive users of a value via worklist expansion,
// stopping at header phis to avoid cycling around loop-carried values.
// NOTE(review): the function signature and the seeding of `Users`
// (orig. 808-809) and the binding of `Cur` from Users[I] (orig. 811) are
// missing from this extraction.
810 for (unsigned I = 0; I != Users.size(); ++I) {
812 if (isa<VPHeaderPHIRecipe>(Cur))
813 continue;
814 for (VPValue *V : Cur->definedValues())
815 Users.insert_range(V->users());
816 }
817 return Users.takeVector();
818}
819
820/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
821/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
822/// generates scalar values.
823static VPValue *
// NOTE(review): the line with the function name and its
// VPWidenPointerInductionRecipe parameter `PtrIV` (orig. 824), the
// definition of `ID` (orig. 826), and the binding of `Steps` (orig. 829)
// are missing from this extraction.
825 VPlan &Plan, VPBuilder &Builder) {
// Start the scalar steps at zero in the step's integer type; the pointer
// start value is added afterwards via PtrAdd.
827 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
828 VPValue *StepV = PtrIV->getOperand(1);
830 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
831 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
832 
833 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
834 PtrIV->getDebugLoc(), "next.gep");
835}
836
837/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
838/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
839/// VPWidenPointerInductionRecipe will generate vectors only. If some users
840/// require vectors while other require scalars, the scalar uses need to extract
841/// the scalars from the generated vectors (Note that this is different to how
842/// int/fp inductions are handled). Legalize extract-from-ends using uniform
843/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
844/// the correct end value is available. Also optimize
845/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
846/// providing them scalar steps built on the canonical scalar IV and update the
847/// original IV's users. This is an optional optimization to reduce the needs of
848/// vector extracts.
// NOTE(review): the function signature and the definition of `HeaderVPBB`
// (orig. 849-850) are missing from this extraction.
851 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
852 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
853 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
854 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
855 if (!PhiR)
856 continue;
857 
858 // Try to narrow wide and replicating recipes to uniform recipes, based on
859 // VPlan analysis.
860 // TODO: Apply to all recipes in the future, to replace legacy uniformity
861 // analysis.
862 auto Users = collectUsersRecursively(PhiR);
863 for (VPUser *U : reverse(Users)) {
864 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
865 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
866 // Skip recipes that shouldn't be narrowed.
867 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
868 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
869 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
870 continue;
871 
872 // Skip recipes that may have other lanes than their first used.
// NOTE(review): the condition implementing the comment above
// (orig. line 873) is missing from this extraction.
874 continue;
875 
876 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
877 Def->operands(), /*IsUniform*/ true,
878 /*Mask*/ nullptr, /*Flags*/ *Def);
879 Clone->insertAfter(Def);
880 Def->replaceAllUsesWith(Clone);
881 }
882 
883 // Replace wide pointer inductions which have only their scalars used by
884 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
885 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
886 if (!Plan.hasScalarVFOnly() &&
887 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
888 continue;
889 
890 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
891 PtrIV->replaceAllUsesWith(PtrAdd);
892 continue;
893 }
894 
895 // Replace widened induction with scalar steps for users that only use
896 // scalars.
897 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
898 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
899 return U->usesScalars(WideIV);
900 }))
901 continue;
902 
903 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
// NOTE(review): the binding of `Steps` to the created scalar IV steps
// (orig. line 904) is missing from this extraction.
905 Plan, ID.getKind(), ID.getInductionOpcode(),
906 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
907 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
908 WideIV->getDebugLoc(), Builder);
909 
910 // Update scalar users of IV to use Step instead.
911 if (!HasOnlyVectorVFs) {
912 assert(!Plan.hasScalableVF() &&
913 "plans containing a scalar VF cannot also include scalable VFs");
914 WideIV->replaceAllUsesWith(Steps);
915 } else {
916 bool HasScalableVF = Plan.hasScalableVF();
917 WideIV->replaceUsesWithIf(Steps,
918 [WideIV, HasScalableVF](VPUser &U, unsigned) {
919 if (HasScalableVF)
920 return U.usesFirstLaneOnly(WideIV);
921 return U.usesScalars(WideIV);
922 });
923 }
924 }
925}
926
927/// Check if \p VPV is an untruncated wide induction, either before or after the
928/// increment. If so return the header IV (before the increment), otherwise
929/// return null.
// NOTE(review): the signature (original lines 930-931) is missing from this
// extract; the body uses parameters `VPV` and `PSE`.
 932 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
 933 if (WideIV) {
 934 // VPV itself is a wide induction, separately compute the end value for exit
 935 // users if it is not a truncated IV.
 936 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
 937 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
 938 }
 939
 940 // Check if VPV is an optimizable induction increment.
 941 VPRecipeBase *Def = VPV->getDefiningRecipe();
 942 if (!Def || Def->getNumOperands() != 2)
 943 return nullptr;
// The wide IV may appear as either operand of the two-operand increment.
 944 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
 945 if (!WideIV)
 946 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
 947 if (!WideIV)
 948 return nullptr;
 949
// Returns true iff VPV is exactly WideIV advanced by the induction step,
// dispatching on the induction opcode from the descriptor.
 950 auto IsWideIVInc = [&]() {
 951 auto &ID = WideIV->getInductionDescriptor();
 952
 953 // Check if VPV increments the induction by the induction step.
 954 VPValue *IVStep = WideIV->getStepValue();
 955 switch (ID.getInductionOpcode()) {
 956 case Instruction::Add:
 957 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
 958 case Instruction::FAdd:
 959 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
 960 case Instruction::FSub:
 961 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
 962 m_Specific(IVStep)));
 963 case Instruction::Sub: {
 964 // IVStep will be the negated step of the subtraction. Check if Step == -1
 965 // * IVStep.
 966 VPValue *Step;
 967 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
 968 return false;
// Use SCEV to prove Step is the negation of IVStep; bail out when either
// expression cannot be computed.
 969 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
 970 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
 971 ScalarEvolution &SE = *PSE.getSE();
 972 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
 973 !isa<SCEVCouldNotCompute>(StepSCEV) &&
 974 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
 975 }
 976 default:
// Pointer inductions are incremented via GEP of the step.
 977 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
 978 match(VPV, m_GetElementPtr(m_Specific(WideIV),
 979 m_Specific(WideIV->getStepValue())));
 980 }
 981 llvm_unreachable("should have been covered by switch above");
 982 };
 983 return IsWideIVInc() ? WideIV : nullptr;
984}
985
986/// Attempts to optimize the induction variable exit values for users in the
987/// early exit block.
// NOTE(review): the first signature line (original 988) and line 992 (the
// final parameters, presumably including `PSE`) are missing from this
// extract; original line 994 (the `match` pattern for `Op`, which binds
// `Mask` and `Incoming`) is also missing.
 989 VPTypeAnalysis &TypeInfo,
 990 VPBlockBase *PredVPBB,
 991 VPValue *Op,
 993 VPValue *Incoming, *Mask;
 995 m_VPValue(Incoming))))
 996 return nullptr;
 997
 998 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
 999 if (!WideIV)
1000 return nullptr;
1001
// Truncated IVs are not handled here.
1002 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1003 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1004 return nullptr;
1005
1006 // Calculate the final index.
1007 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1008 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1009 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1010 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
1011
// Index of the exiting iteration = canonical IV + first active lane of the
// exit mask, converted to the canonical IV's type.
1012 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
1013 VPValue *FirstActiveLane =
1014 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
1015 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1016 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1017 FirstActiveLaneType, DL);
1018 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1019
1020 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1021 // changed it means the exit is using the incremented value, so we need to
1022 // add the step.
1023 if (Incoming != WideIV) {
1024 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1025 EndValue = B.createAdd(EndValue, One, DL);
1026 }
1027
// Non-canonical inductions need the index translated through a derived IV
// (start + index * step).
1028 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1029 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1030 VPIRValue *Start = WideIV->getStartValue();
1031 VPValue *Step = WideIV->getStepValue();
1032 EndValue = B.createDerivedIV(
1033 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1034 Start, EndValue, Step);
1035 }
1036
1037 return EndValue;
1038}
1039
1040/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1041/// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): the first signature line (original 1042, containing the
// return type and function name) is missing from this extract.
1043 VPBuilder &VectorPHBuilder,
1044 VPTypeAnalysis &TypeInfo,
1045 VPValue *VectorTC) {
1046 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1047 // Truncated wide inductions resume from the last lane of their vector value
1048 // in the last vector iteration which is handled elsewhere.
1049 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1050 return nullptr;
1051
1052 VPIRValue *Start = WideIV->getStartValue();
1053 VPValue *Step = WideIV->getStepValue();
// NOTE(review): original line 1054 (presumably binding the induction
// descriptor `ID` used below) is missing from this extract.
// For the canonical IV the end value is simply the vector trip count;
// otherwise derive start + VectorTC * step.
1055 VPValue *EndValue = VectorTC;
1056 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1057 EndValue = VectorPHBuilder.createDerivedIV(
1058 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1059 Start, VectorTC, Step);
1060 }
1061
1062 // EndValue is derived from the vector trip count (which has the same type as
1063 // the widest induction) and thus may be wider than the induction here.
1064 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1065 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1066 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1067 ScalarTypeOfWideIV,
1068 WideIV->getDebugLoc());
1069 }
1070
1071 return EndValue;
1072}
1073
1074/// Attempts to optimize the induction variable exit values for users in the
1075/// exit block coming from the latch in the original scalar loop.
// NOTE(review): the first signature line (original 1076) and lines 1078
// (remaining parameters, presumably `EndValues` and `PSE`) and 1081 (the
// `match` on `Op` binding `Incoming`) are missing from this extract.
1077 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1079 VPValue *Incoming;
1080 VPWidenInductionRecipe *WideIV = nullptr;
1082 WideIV = getOptimizableIVOf(Incoming, PSE);
1083
1084 if (!WideIV)
1085 return nullptr;
1086
// End values were populated up-front (see the loop over header phis in the
// caller); missing entries would be a logic error.
1087 VPValue *EndValue = EndValues.lookup(WideIV);
1088 assert(EndValue && "Must have computed the end value up front");
1089
1090 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1091 // changed it means the exit is using the incremented value, so we don't
1092 // need to subtract the step.
1093 if (Incoming != WideIV)
1094 return EndValue;
1095
1096 // Otherwise, subtract the step from the EndValue.
1097 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1098 VPValue *Step = WideIV->getStepValue();
1099 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
// Integer IV: plain subtract.
1100 if (ScalarTy->isIntegerTy())
1101 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
// Pointer IV: step back via ptradd with the negated step.
1102 if (ScalarTy->isPointerTy()) {
1103 Type *StepTy = TypeInfo.inferScalarType(Step);
1104 auto *Zero = Plan.getZero(StepTy);
1105 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1106 DebugLoc::getUnknown(), "ind.escape");
1107 }
// FP IV: invert the induction's binary op (FAdd <-> FSub), keeping its
// fast-math flags.
1108 if (ScalarTy->isFloatingPointTy()) {
1109 const auto &ID = WideIV->getInductionDescriptor();
1110 return B.createNaryOp(
1111 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1112 ? Instruction::FSub
1113 : Instruction::FAdd,
1114 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1115 }
1116 llvm_unreachable("all possible induction types must be handled");
1117 return nullptr;
1118}
1119
// NOTE(review): the first signature line (original 1120, the function name —
// presumably VPlanTransforms::optimizeInductionExitUsers) is missing from
// this extract; lines 1127 (declaration of `EndValues`), 1134 (the
// `if (VPValue *EndValue = ...` computing per-IV end values) and 1163 (the
// early-exit helper call) are missing as well.
1121 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1122 // Compute end values for all inductions.
1123 VPTypeAnalysis TypeInfo(Plan);
1124 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1125 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1126 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// When tail folding, the vector loop processes the full trip count.
1128 VPValue *ResumeTC =
1129 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1130 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1131 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1132 if (!WideIV)
1133 continue;
1135 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1136 EndValues[WideIV] = EndValue;
1137 }
1138
// Replace exiting-IV-value recipes in the middle block with the precomputed
// end values and drop the now-dead recipes.
1139 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1140 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1141 VPValue *Op;
1142 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1143 continue;
1144 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1145 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1146 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1147 R.eraseFromParent();
1148 }
1149 }
1150
1151 // Then, optimize exit block users.
1152 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1153 for (VPRecipeBase &R : ExitVPBB->phis()) {
1154 auto *ExitIRI = cast<VPIRPhi>(&R);
1155
// Each predecessor contributes one incoming value; latch exits go through
// the middle block, others are early exits.
1156 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1157 VPValue *Escape = nullptr;
1158 if (PredVPBB == MiddleVPBB)
1159 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1160 ExitIRI->getOperand(Idx),
1161 EndValues, PSE);
1162 else
1164 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1165 if (Escape)
1166 ExitIRI->setOperand(Idx, Escape);
1167 }
1168 }
1169 }
1170}
1171
1172/// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing
1173/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): the signature (original lines 1174-1175, including the
// SCEV-to-VPValue map declaration) and line 1178 (the iterated range,
// presumably the plan entry block) are missing from this extract.
1176
1177 for (VPRecipeBase &R :
1179 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1180 if (!ExpR)
1181 continue;
1182
// First expansion of a SCEV wins; later duplicates are folded into it.
1183 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1184 if (Inserted)
1185 continue;
1186 ExpR->replaceAllUsesWith(V->second);
1187 ExpR->eraseFromParent();
1188 }
1189}
1190
// Worklist-driven removal of dead recipes: starting from a seed value, erase
// its defining recipe if dead and re-queue the operands, which may have
// become dead in turn.
// NOTE(review): the signature (original line 1191) and line 1193 (the `Seen`
// set declaration) are missing from this extract.
1192 SmallVector<VPValue *> WorkList;
1194 WorkList.push_back(V);
1195
1196 while (!WorkList.empty()) {
1197 VPValue *Cur = WorkList.pop_back_val();
// Guard against revisiting values reachable through multiple users.
1198 if (!Seen.insert(Cur).second)
1199 continue;
1200 VPRecipeBase *R = Cur->getDefiningRecipe();
1201 if (!R)
1202 continue;
1203 if (!isDeadRecipe(*R))
1204 continue;
// Queue operands before erasing, as erasing drops the uses keeping them
// alive.
1205 append_range(WorkList, R->operands());
1206 R->eraseFromParent();
1207 }
1208}
1209
1210/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1211/// Returns an optional pair, where the first element indicates whether it is
1212/// an intrinsic ID.
1213static std::optional<std::pair<bool, unsigned>>
// NOTE(review): original lines 1217-1218 (the recipe classes listed in the
// first TypeSwitch Case, which map directly to an instruction opcode) are
// missing from this extract.
1214getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1215 return TypeSwitch<const VPSingleDefRecipe *,
1216 std::optional<std::pair<bool, unsigned>>>(R)
1219 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1220 .Case([](const VPWidenIntrinsicRecipe *I) {
// `true` marks an intrinsic ID rather than an instruction opcode.
1221 return std::make_pair(true, I->getVectorIntrinsicID());
1222 })
1223 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1224 [](auto *I) {
1225 // For recipes that do not directly map to LLVM IR instructions,
1226 // assign opcodes after the last VPInstruction opcode (which is also
1227 // after the last IR Instruction opcode), based on the VPRecipeID.
1228 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1229 I->getVPRecipeID());
1230 })
1231 .Default([](auto *) { return std::nullopt; });
1232}
1233
1234/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1235/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1236/// Operands are foldable live-ins.
// NOTE(review): the first signature line (original 1237) is missing from this
// extract, as are lines 1245 (declaration of the `Ops` vector), 1272/1274
// (a select-like case and its operands), 1277 (the all-ones operand of the
// Not fold) and 1290-1291 (the PtrAdd case label(s)).
1238 ArrayRef<VPValue *> Operands,
1239 const DataLayout &DL,
1240 VPTypeAnalysis &TypeInfo) {
1241 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1242 if (!OpcodeOrIID)
1243 return nullptr;
1244
// Folding requires every operand to be a live-in with a concrete IR value.
1246 for (VPValue *Op : Operands) {
1247 if (!match(Op, m_LiveIn()))
1248 return nullptr;
1249 Value *V = Op->getUnderlyingValue();
1250 if (!V)
1251 return nullptr;
1252 Ops.push_back(V);
1253 }
1254
1255 auto FoldToIRValue = [&]() -> Value * {
1256 InstSimplifyFolder Folder(DL);
// Intrinsic path: only binary intrinsics are handled.
1257 if (OpcodeOrIID->first) {
1258 if (R.getNumOperands() != 2)
1259 return nullptr;
1260 unsigned ID = OpcodeOrIID->second;
1261 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1262 TypeInfo.inferScalarType(&R));
1263 }
1264 unsigned Opcode = OpcodeOrIID->second;
1265 if (Instruction::isBinaryOp(Opcode))
1266 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1267 Ops[0], Ops[1]);
1268 if (Instruction::isCast(Opcode))
1269 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1270 TypeInfo.inferScalarType(R.getVPSingleValue()));
1271 switch (Opcode) {
1273 return Folder.FoldSelect(Ops[0], Ops[1],
1275 case VPInstruction::Not:
1276 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1278 case Instruction::Select:
1279 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1280 case Instruction::ICmp:
1281 case Instruction::FCmp:
1282 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1283 Ops[1]);
1284 case Instruction::GetElementPtr: {
1285 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1286 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1287 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1288 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1289 }
// (PtrAdd-style case; label lines 1290-1291 missing.) Byte-addressed GEP.
1292 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1293 Ops[0], Ops[1],
1294 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1295 // An extract of a live-in is an extract of a broadcast, so return the
1296 // broadcasted element.
1297 case Instruction::ExtractElement:
1298 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1299 return Ops[0];
1300 }
1301 return nullptr;
1302 };
1303
// A successful IR-level fold is re-introduced into the plan as a live-in.
1304 if (Value *V = FoldToIRValue())
1305 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1306 return nullptr;
1307}
1308
1309/// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): the signature (original line 1310) is missing from this
// extract; the body uses parameters `Def` and `TypeInfo`. Numerous interior
// lines are also absent (e.g. 1385-1386, 1402, 1425-1426, 1436, 1441, 1445,
// 1477, 1485, 1488, 1521, 1548, 1553, 1578-1580, 1593, 1604, 1606, 1614,
// 1631, 1639, 1654-1655, 1661, 1675, 1688, 1711-1712, 1739, 1748,
// 1754-1755) — each is the pattern/condition for the statement that follows.
1311 VPlan *Plan = Def->getParent()->getPlan();
1312
1313 // Simplification of live-in IR values for SingleDef recipes using
1314 // InstSimplifyFolder.
1315 const DataLayout &DL = Plan->getDataLayout();
1316 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1317 return Def->replaceAllUsesWith(V);
1318
1319 // Fold PredPHI LiveIn -> LiveIn.
1320 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1321 VPValue *Op = PredPHI->getOperand(0);
1322 if (isa<VPIRValue>(Op))
1323 PredPHI->replaceAllUsesWith(Op);
1324 }
1325
1326 VPBuilder Builder(Def);
1327
1328 // Avoid replacing VPInstructions with underlying values with new
1329 // VPInstructions, as we would fail to create widen/replicate recpes from the
1330 // new VPInstructions without an underlying value, and miss out on some
1331 // transformations that only apply to widened/replicated recipes later, by
1332 // doing so.
1333 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1334 // VPInstructions without underlying values, as those will get skipped during
1335 // cost computation.
1336 bool CanCreateNewRecipe =
1337 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1338
// Cast simplifications: trunc(ext(A)) either collapses to A, or to a single
// narrower/wider cast.
1339 VPValue *A;
1340 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1341 Type *TruncTy = TypeInfo.inferScalarType(Def);
1342 Type *ATy = TypeInfo.inferScalarType(A);
1343 if (TruncTy == ATy) {
1344 Def->replaceAllUsesWith(A);
1345 } else {
1346 // Don't replace a non-widened cast recipe with a widened cast.
1347 if (!isa<VPWidenCastRecipe>(Def))
1348 return;
1349 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1350
// Keep the original extension kind (sign vs zero) when re-extending.
1351 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1352 ? Instruction::SExt
1353 : Instruction::ZExt;
1354 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1355 TruncTy);
1356 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1357 // UnderlyingExt has distinct return type, used to retain legacy cost.
1358 Ext->setUnderlyingValue(UnderlyingExt);
1359 }
1360 Def->replaceAllUsesWith(Ext);
1361 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1362 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1363 Def->replaceAllUsesWith(Trunc);
1364 }
1365 }
1366#ifndef NDEBUG
1367 // Verify that the cached type info is for both A and its users is still
1368 // accurate by comparing it to freshly computed types.
1369 VPTypeAnalysis TypeInfo2(*Plan);
1370 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1371 for (VPUser *U : A->users()) {
1372 auto *R = cast<VPRecipeBase>(U);
1373 for (VPValue *VPV : R->definedValues())
1374 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1375 }
1376#endif
1377 }
1378
1379 // Simplify (X && Y) | (X && !Y) -> X.
1380 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1381 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1382 // recipes to be visited during simplification.
1383 VPValue *X, *Y, *Z;
1384 if (match(Def,
1387 Def->replaceAllUsesWith(X);
1388 Def->eraseFromParent();
1389 return;
1390 }
1391
1392 // x | AllOnes -> AllOnes
1393 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1394 return Def->replaceAllUsesWith(
1395 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1396
1397 // x | 0 -> x
1398 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1399 return Def->replaceAllUsesWith(X);
1400
1401 // x | !x -> AllOnes
1403 return Def->replaceAllUsesWith(
1404 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1405
1406 // x & 0 -> 0
1407 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1408 return Def->replaceAllUsesWith(
1409 Plan->getZero(TypeInfo.inferScalarType(Def)));
1410
1411 // x & AllOnes -> x
1412 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1413 return Def->replaceAllUsesWith(X);
1414
1415 // x && false -> false
1416 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1417 return Def->replaceAllUsesWith(Plan->getFalse());
1418
1419 // x && true -> x
1420 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1421 return Def->replaceAllUsesWith(X);
1422
1423 // (x && y) | (x && z) -> x && (y | z)
1424 if (CanCreateNewRecipe &&
1427 // Simplify only if one of the operands has one use to avoid creating an
1428 // extra recipe.
1429 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1430 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1431 return Def->replaceAllUsesWith(
1432 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1433
1434 // x && (x && y) -> x && y
1435 if (match(Def, m_LogicalAnd(m_VPValue(X),
1437 return Def->replaceAllUsesWith(Def->getOperand(1));
1438
1439 // x && (y && x) -> x && y
1440 if (match(Def, m_LogicalAnd(m_VPValue(X),
1442 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1443
1444 // x && !x -> 0
1446 return Def->replaceAllUsesWith(Plan->getFalse());
1447
// select c, x, x -> x
1448 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1449 return Def->replaceAllUsesWith(X);
1450
1451 // select c, false, true -> not c
1452 VPValue *C;
1453 if (CanCreateNewRecipe &&
1454 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1455 return Def->replaceAllUsesWith(Builder.createNot(C));
1456
1457 // select !c, x, y -> select c, y, x
1458 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1459 Def->setOperand(0, C);
1460 Def->setOperand(1, Y);
1461 Def->setOperand(2, X);
1462 return;
1463 }
1464
// Arithmetic identities: x + 0, x * 1, x * 0.
1465 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1466 return Def->replaceAllUsesWith(A);
1467
1468 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1469 return Def->replaceAllUsesWith(A);
1470
1471 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1472 return Def->replaceAllUsesWith(
1473 Plan->getZero(TypeInfo.inferScalarType(Def)));
1474
// x * -1 -> 0 - x, preserving nsw where valid.
1475 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1476 // Preserve nsw from the Mul on the new Sub.
1478 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1479 return Def->replaceAllUsesWith(
1480 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1481 Def->getDebugLoc(), "", NW));
1482 }
1483
1484 if (CanCreateNewRecipe &&
1486 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1487 // new Sub.
1489 false,
1490 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1491 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1492 ->hasNoSignedWrap()};
1493 return Def->replaceAllUsesWith(
1494 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1495 }
1496
// Strength reduction: mul/udiv by a power of two becomes shl/lshr.
1497 const APInt *APC;
1498 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1499 APC->isPowerOf2())
1500 return Def->replaceAllUsesWith(Builder.createNaryOp(
1501 Instruction::Shl,
1502 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1503 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1504
1505 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1506 APC->isPowerOf2())
1507 return Def->replaceAllUsesWith(Builder.createNaryOp(
1508 Instruction::LShr,
1509 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1510 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1511
1512 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not A) -> A
1513 if (match(A, m_Not(m_VPValue(A))))
1514 return Def->replaceAllUsesWith(A);
1515
1516 // Try to fold Not into compares by adjusting the predicate in-place.
1517 CmpPredicate Pred;
1518 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1519 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only legal if every user of the compare is either a negation of it or a
// select on it, so all users can be rewritten consistently.
1520 if (all_of(Cmp->users(),
1522 m_Not(m_Specific(Cmp)),
1523 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1524 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1525 for (VPUser *U : to_vector(Cmp->users())) {
1526 auto *R = cast<VPSingleDefRecipe>(U);
1527 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1528 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1529 R->setOperand(1, Y);
1530 R->setOperand(2, X);
1531 } else {
1532 // not (cmp pred) -> cmp inv_pred
1533 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1534 R->replaceAllUsesWith(Cmp);
1535 }
1536 }
1537 // If Cmp doesn't have a debug location, use the one from the negation,
1538 // to preserve the location.
1539 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1540 Cmp->setDebugLoc(Def->getDebugLoc());
1541 }
1542 }
1543 }
1544
1545 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1546 // any-of (fcmp uno %A, %B), ...
1547 if (match(Def, m_AnyOf())) {
// Pair up single-use self-NaN-checks two at a time; each pair becomes one
// fcmp uno of the two checked values.
1549 VPRecipeBase *UnpairedCmp = nullptr;
1550 for (VPValue *Op : Def->operands()) {
1551 VPValue *X;
1552 if (Op->getNumUsers() > 1 ||
1554 m_Deferred(X)))) {
1555 NewOps.push_back(Op);
1556 } else if (!UnpairedCmp) {
1557 UnpairedCmp = Op->getDefiningRecipe();
1558 } else {
1559 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1560 UnpairedCmp->getOperand(0), X));
1561 UnpairedCmp = nullptr;
1562 }
1563 }
1564
// An odd leftover check is kept as-is.
1565 if (UnpairedCmp)
1566 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1567
1568 if (NewOps.size() < Def->getNumOperands()) {
1569 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1570 return Def->replaceAllUsesWith(NewAnyOf);
1571 }
1572 }
1573
1574 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1575 // This is useful for fmax/fmin without fast-math flags, where we need to
1576 // check if any operand is NaN.
1577 if (CanCreateNewRecipe &&
1579 m_Deferred(X)),
1581 m_Deferred(Y))))) {
1582 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1583 return Def->replaceAllUsesWith(NewCmp);
1584 }
1585
1586 // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1587 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1588 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1589 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1590 TypeInfo.inferScalarType(Def))
1591 return Def->replaceAllUsesWith(Def->getOperand(1));
1592
// NOTE(review): the matched pattern (original line 1593) is missing; the
// body replaces Def with X, truncating if the types differ.
1594 m_One()))) {
1595 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1596 if (TypeInfo.inferScalarType(X) != WideStepTy)
1597 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1598 Def->replaceAllUsesWith(X);
1599 return;
1600 }
1601
1602 // For i1 vp.merges produced by AnyOf reductions:
1603 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1605 m_VPValue(X), m_VPValue())) &&
1607 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1608 Def->setOperand(1, Def->getOperand(0));
1609 Def->setOperand(0, Y);
1610 return;
1611 }
1612
1613 // Simplify MaskedCond with no block mask to its single operand.
1615 !cast<VPInstruction>(Def)->isMasked())
1616 return Def->replaceAllUsesWith(Def->getOperand(0));
1617
1618 // Look through ExtractLastLane.
1619 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1620 if (match(A, m_BuildVector())) {
// Last lane of a BuildVector is its last operand.
1621 auto *BuildVector = cast<VPInstruction>(A);
1622 Def->replaceAllUsesWith(
1623 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1624 return;
1625 }
1626 if (Plan->hasScalarVFOnly())
1627 return Def->replaceAllUsesWith(A);
1628 }
1629
1630 // Look through ExtractPenultimateElement (BuildVector ....).
1632 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1633 Def->replaceAllUsesWith(
1634 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1635 return;
1636 }
1637
// Extract of a known lane from a BuildVector is the matching operand.
1638 uint64_t Idx;
1640 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1641 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1642 return;
1643 }
1644
// BuildVector of identical elements is a broadcast.
1645 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1646 Def->replaceAllUsesWith(
1647 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1648 return;
1649 }
1650
1651 // Look through broadcast of single-scalar when used as select conditions; in
1652 // that case the scalar condition can be used directly.
1653 if (match(Def,
1656 "broadcast operand must be single-scalar");
1657 Def->setOperand(0, C);
1658 return;
1659 }
1660
// NOTE(review): the matched pattern (original line 1661) is missing; the
// body handles a phi-like recipe, forwarding single-operand cases.
1662 if (Def->getNumOperands() == 1) {
1663 Def->replaceAllUsesWith(Def->getOperand(0));
1664 return;
1665 }
1666 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1667 if (all_equal(Phi->incoming_values()))
1668 Phi->replaceAllUsesWith(Phi->getOperand(0));
1669 }
1670 return;
1671 }
1672
1673 VPIRValue *IRV;
1674 if (Def->getNumOperands() == 1 &&
1676 return Def->replaceAllUsesWith(IRV);
1677
1678 // Some simplifications can only be applied after unrolling. Perform them
1679 // below.
1680 if (!Plan->isUnrolled())
1681 return;
1682
1683 // After unrolling, extract-lane may be used to extract values from multiple
1684 // scalar sources. Only simplify when extracting from a single scalar source.
1685 VPValue *LaneToExtract;
1686 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1687 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1689 return Def->replaceAllUsesWith(A);
1690
1691 // Simplify extract-lane with single source to extract-element.
1692 Def->replaceAllUsesWith(Builder.createNaryOp(
1693 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1694 return;
1695 }
1696
1697 // Look for cycles where Def is of the form:
1698 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1699 // IVInc = X + Step ; used by X and Def
1700 // Def = IVInc + Y
1701 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1702 // and if Inc exists, replace it with X.
1703 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1704 isa<VPIRValue>(Y) &&
1705 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1706 auto *Phi = cast<VPPhi>(X);
1707 auto *IVInc = Def->getOperand(0);
1708 if (IVInc->getNumUsers() == 2) {
1709 // If Phi has a second user (besides IVInc's defining recipe), it must
1710 // be Inc = Phi + Y for the fold to apply.
1713 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1714 Def->replaceAllUsesWith(IVInc);
1715 if (Inc)
1716 Inc->replaceAllUsesWith(Phi);
1717 Phi->setOperand(0, Y);
1718 return;
1719 }
1720 }
1721 }
1722
1723 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1724 // just the pointer operand.
1725 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1726 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1727 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1728
1729 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1730 // the start index is zero and only the first lane 0 is demanded.
1731 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1732 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1733 Steps->replaceAllUsesWith(Steps->getOperand(0));
1734 return;
1735 }
1736 }
1737 // Simplify redundant ReductionStartVector recipes after unrolling.
1738 VPValue *StartV;
1740 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// Only in-loop reduction phis can take the scalar start value directly.
1741 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1742 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1743 return PhiR && PhiR->isInLoop();
1744 });
1745 return;
1746 }
1747
// NOTE(review): the matched pattern (original line 1748) is missing; the
// body forwards A through Def.
1749 Def->replaceAllUsesWith(A);
1750 return;
1751 }
1752
// Extract-last-lane of a single-scalar value is the value itself, provided
// all of A's other users consume scalars.
1753 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1756 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1757 all_of(A->users(),
1758 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1759 return Def->replaceAllUsesWith(A);
1760 }
1761
// With a single unrolled part, extracting the last part is the identity.
1762 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1763 return Def->replaceAllUsesWith(A);
1764}
1765
// Driver: walk the plan's blocks and run simplifyRecipe on each single-def
// recipe, using early-inc iteration since recipes may be erased.
// NOTE(review): the function signature (original lines 1766-1767, presumably
// VPlanTransforms::simplifyRecipes with a traversal starting at
// Plan.getEntry()) and line 1770 (the outer loop over blocks) are missing
// from this extract.
1768 Plan.getEntry());
1769 VPTypeAnalysis TypeInfo(Plan);
1771 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1772 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1773 simplifyRecipe(Def, TypeInfo);
1774 }
1775}
1776
1777/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1778/// header mask to be simplified further when tail folding, e.g. in
1779/// optimizeEVLMasks.
1780static void reassociateHeaderMask(VPlan &Plan) {
1781 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1782 if (!HeaderMask)
1783 return;
1784
// Seed the worklist with direct logical-and users of the header mask.
1785 SmallVector<VPUser *> Worklist;
1786 for (VPUser *U : HeaderMask->users())
1787 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
// NOTE(review): original line 1788 (presumably `Worklist.push_back(U);`) is
// missing from this extract.
1789
1790 while (!Worklist.empty()) {
1791 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1792 VPValue *X, *Y;
// Only rewrite ((HeaderMask && X) && Y); other users are skipped.
1793 if (!R || !match(R, m_LogicalAnd(
1794 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1795 m_VPValue(Y))))
1796 continue;
// The rewritten recipe's users may themselves become reassociable.
1797 append_range(Worklist, R->users());
1798 VPBuilder Builder(R);
1799 R->replaceAllUsesWith(
1800 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1801 }
1802}
1803
1805 if (Plan.hasScalarVFOnly())
1806 return;
1807
1808 // Try to narrow wide and replicating recipes to single scalar recipes,
1809 // based on VPlan analysis. Only process blocks in the loop region for now,
1810 // without traversing into nested regions, as recipes in replicate regions
1811 // cannot be converted yet.
1814 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1816 VPWidenStoreRecipe>(&R))
1817 continue;
1818 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1819 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1820 continue;
1821
1822 // Convert an unmasked scatter with an uniform address into
1823 // extract-last-lane + scalar store.
1824 // TODO: Add a profitability check comparing the cost of a scatter vs.
1825 // extract + scalar store.
1826 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1827 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1828 !WidenStoreR->isConsecutive()) {
1829 VPValue *Mask = WidenStoreR->getMask();
1830
1831 // Only convert the scatter to a scalar store if it is unmasked.
1832 // TODO: Support converting scatter masked by the header mask to scalar
1833 // store.
1834 if (Mask)
1835 continue;
1836
1838 {WidenStoreR->getOperand(1)});
1839 Extract->insertBefore(WidenStoreR);
1840
1841 // TODO: Sink the scalar store recipe to middle block if possible.
1842 auto *ScalarStore = new VPReplicateRecipe(
1843 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1844 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1845 *WidenStoreR /*Metadata*/);
1846 ScalarStore->insertBefore(WidenStoreR);
1847 WidenStoreR->eraseFromParent();
1848 continue;
1849 }
1850
1851 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1852 if (RepR && RepR->getOpcode() == Instruction::Store &&
1853 vputils::isSingleScalar(RepR->getOperand(1))) {
1854 auto *Clone = new VPReplicateRecipe(
1855 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1856 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1857 *RepR /*Metadata*/, RepR->getDebugLoc());
1858 Clone->insertBefore(RepOrWidenR);
1859 VPBuilder Builder(Clone);
1860 VPValue *ExtractOp = Clone->getOperand(0);
1861 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1862 ExtractOp =
1863 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1864 ExtractOp =
1865 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1866 Clone->setOperand(0, ExtractOp);
1867 RepR->eraseFromParent();
1868 continue;
1869 }
1870
1871 // Skip recipes that aren't single scalars.
1872 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1873 continue;
1874
1875 // Predicate to check if a user of Op introduces extra broadcasts.
1876 auto IntroducesBCastOf = [](const VPValue *Op) {
1877 return [Op](const VPUser *U) {
1878 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1882 VPI->getOpcode()))
1883 return false;
1884 }
1885 return !U->usesScalars(Op);
1886 };
1887 };
1888
1889 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1890 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1891 if (any_of(
1892 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1893 IntroducesBCastOf(Op)))
1894 return false;
1895 // Non-constant live-ins require broadcasts, while constants do not
1896 // need explicit broadcasts.
1897 auto *IRV = dyn_cast<VPIRValue>(Op);
1898 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1899 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1900 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1901 }))
1902 continue;
1903
1904 auto *Clone = new VPReplicateRecipe(
1905 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1906 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1907 Clone->insertBefore(RepOrWidenR);
1908 RepOrWidenR->replaceAllUsesWith(Clone);
1909 if (isDeadRecipe(*RepOrWidenR))
1910 RepOrWidenR->eraseFromParent();
1911 }
1912 }
1913}
1914
1915/// Try to see if all of \p Blend's masks share a common value logically and'ed
1916/// and remove it from the masks.
1918 if (Blend->isNormalized())
1919 return;
1920 VPValue *CommonEdgeMask;
1921 if (!match(Blend->getMask(0),
1922 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1923 return;
1924 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1925 if (!match(Blend->getMask(I),
1926 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1927 return;
1928 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1929 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1930}
1931
1932/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1933/// to make sure the masks are simplified.
1934static void simplifyBlends(VPlan &Plan) {
 // NOTE(review): the outer traversal over the plan's basic blocks (orig lines
 // 1935-1936, defining VPBB) is elided in this listing — confirm against the
 // full source.
1937 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1938 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1939 if (!Blend)
1940 continue;
1941
1942 removeCommonBlendMask(Blend);
1943
1944 // Try to remove redundant blend recipes.
 // Collect the distinct incoming values whose masks are not known-false;
 // if only one remains, the blend always produces that value.
1945 SmallPtrSet<VPValue *, 4> UniqueValues;
1946 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1947 UniqueValues.insert(Blend->getIncomingValue(0));
1948 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1949 if (!match(Blend->getMask(I), m_False()))
1950 UniqueValues.insert(Blend->getIncomingValue(I));
1951
1952 if (UniqueValues.size() == 1) {
1953 Blend->replaceAllUsesWith(*UniqueValues.begin());
1954 Blend->eraseFromParent();
1955 continue;
1956 }
1957
1958 if (Blend->isNormalized())
1959 continue;
1960
1961 // Normalize the blend so its first incoming value is used as the initial
1962 // value with the others blended into it.
1963
 // Pick as the start value an incoming value whose mask can be removed
 // entirely (mask used only by this blend).
1964 unsigned StartIndex = 0;
1965 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1966 // If a value's mask is used only by the blend then is can be deadcoded.
1967 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1968 // that's used by multiple blends where it can be removed from them all.
1969 VPValue *Mask = Blend->getMask(I);
1970 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1971 StartIndex = I;
1972 break;
1973 }
1974 }
1975
1976 SmallVector<VPValue *, 4> OperandsWithMask;
1977 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1978
1979 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1980 if (I == StartIndex)
1981 continue;
1982 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1983 OperandsWithMask.push_back(Blend->getMask(I));
1984 }
1985
1986 auto *NewBlend =
1987 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1988 OperandsWithMask, *Blend, Blend->getDebugLoc());
1989 NewBlend->insertBefore(&R);
1990
1991 VPValue *DeadMask = Blend->getMask(StartIndex);
1992 Blend->replaceAllUsesWith(NewBlend);
1993 Blend->eraseFromParent();
 // NOTE(review): orig line 1994 is elided here; presumably it deletes
 // DeadMask (e.g. recursively removing the now-dead mask computation) —
 // confirm against the full source, otherwise DeadMask is unused.
1995
1996 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1997 VPValue *NewMask;
1998 if (NewBlend->getNumOperands() == 3 &&
1999 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2000 VPValue *Inc0 = NewBlend->getOperand(0);
2001 VPValue *Inc1 = NewBlend->getOperand(1);
2002 VPValue *OldMask = NewBlend->getOperand(2);
2003 NewBlend->setOperand(0, Inc1);
2004 NewBlend->setOperand(1, Inc0);
2005 NewBlend->setOperand(2, NewMask);
 // The Not(%mask) instruction may now be dead; drop it if so.
2006 if (OldMask->getNumUsers() == 0)
2007 cast<VPInstruction>(OldMask)->eraseFromParent();
2008 }
2009 }
2010 }
2011}
2012
2013/// Optimize the width of vector induction variables in \p Plan based on a known
2014/// constant Trip Count, \p BestVF and \p BestUF.
2016 ElementCount BestVF,
2017 unsigned BestUF) {
2018 // Only proceed if we have not completely removed the vector region.
2019 if (!Plan.getVectorLoopRegion())
2020 return false;
2021
2022 const APInt *TC;
2023 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2024 return false;
2025
2026 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2027 // and UF. Returns at least 8.
2028 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2029 APInt AlignedTC =
2032 APInt MaxVal = AlignedTC - 1;
2033 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2034 };
2035 unsigned NewBitWidth =
2036 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2037
2038 LLVMContext &Ctx = Plan.getContext();
2039 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2040
2041 bool MadeChange = false;
2042
2043 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2044 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2045 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2046
2047 // Currently only handle canonical IVs as it is trivial to replace the start
2048 // and stop values, and we currently only perform the optimization when the
2049 // IV has a single use.
2050 if (!WideIV || !WideIV->isCanonical() ||
2051 WideIV->hasMoreThanOneUniqueUser() ||
2052 NewIVTy == WideIV->getScalarType())
2053 continue;
2054
2055 // Currently only handle cases where the single user is a header-mask
2056 // comparison with the backedge-taken-count.
2057 VPUser *SingleUser = WideIV->getSingleUser();
2058 if (!SingleUser ||
2059 !match(SingleUser,
2060 m_ICmp(m_Specific(WideIV),
2062 continue;
2063
2064 // Update IV operands and comparison bound to use new narrower type.
2065 auto *NewStart = Plan.getZero(NewIVTy);
2066 WideIV->setStartValue(NewStart);
2067 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2068 WideIV->setStepValue(NewStep);
2069
2070 auto *NewBTC = new VPWidenCastRecipe(
2071 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2072 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2073 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2074 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2075 Cmp->setOperand(1, NewBTC);
2076
2077 MadeChange = true;
2078 }
2079
2080 return MadeChange;
2081}
2082
2083/// Return true if \p Cond is known to be true for given \p BestVF and \p
2084/// BestUF.
2086 ElementCount BestVF, unsigned BestUF,
2089 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2090 &PSE](VPValue *C) {
2091 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2092 });
2093
2094 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2097 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2098 m_Specific(&Plan.getVectorTripCount()))))
2099 return false;
2100
2101 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2102 // count is not conveniently available as SCEV so far, so we compare directly
2103 // against the original trip count. This is stricter than necessary, as we
2104 // will only return true if the trip count == vector trip count.
2105 const SCEV *VectorTripCount =
2107 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2108 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2109 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2110 "Trip count SCEV must be computable");
2111 ScalarEvolution &SE = *PSE.getSE();
2112 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2113 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2114 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2115}
2116
2117/// Try to replace multiple active lane masks used for control flow with
2118/// a single, wide active lane mask instruction followed by multiple
2119/// extract subvector intrinsics. This applies to the active lane mask
2120/// instructions both in the loop and in the preheader.
2121/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2122/// new extracts from the first active lane mask, which has it's last
2123/// operand (multiplier) set to UF.
2125 unsigned UF) {
2126 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2127 return false;
2128
2129 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2130 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2131 auto *Term = &ExitingVPBB->back();
2132
2133 using namespace llvm::VPlanPatternMatch;
2135 m_VPValue(), m_VPValue(), m_VPValue())))))
2136 return false;
2137
2138 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2139 LLVMContext &Ctx = Plan.getContext();
2140
2141 auto ExtractFromALM = [&](VPInstruction *ALM,
2142 SmallVectorImpl<VPValue *> &Extracts) {
2143 DebugLoc DL = ALM->getDebugLoc();
2144 for (unsigned Part = 0; Part < UF; ++Part) {
2146 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2147 auto *Ext =
2148 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2149 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2150 Extracts[Part] = Ext;
2151 Ext->insertAfter(ALM);
2152 }
2153 };
2154
2155 // Create a list of each active lane mask phi, ordered by unroll part.
2157 for (VPRecipeBase &R : Header->phis()) {
2159 if (!Phi)
2160 continue;
2161 VPValue *Index = nullptr;
2162 match(Phi->getBackedgeValue(),
2164 assert(Index && "Expected index from ActiveLaneMask instruction");
2165
2166 uint64_t Part;
2167 if (match(Index,
2169 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2170 Phis[Part] = Phi;
2171 else {
2172 // Anything other than a CanonicalIVIncrementForPart is part 0
2173 assert(!match(
2174 Index,
2176 Phis[0] = Phi;
2177 }
2178 }
2179
2180 assert(all_of(Phis, not_equal_to(nullptr)) &&
2181 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2182
2183 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2184 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2185
2186 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2187 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2188 "Expected incoming values of Phi to be ActiveLaneMasks");
2189
2190 // When using wide lane masks, the return type of the get.active.lane.mask
2191 // intrinsic is VF x UF (last operand).
2192 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2193 EntryALM->setOperand(2, ALMMultiplier);
2194 LoopALM->setOperand(2, ALMMultiplier);
2195
2196 // Create UF x extract vectors and insert into preheader.
2197 SmallVector<VPValue *> EntryExtracts(UF);
2198 ExtractFromALM(EntryALM, EntryExtracts);
2199
2200 // Create UF x extract vectors and insert before the loop compare & branch,
2201 // updating the compare to use the first extract.
2202 SmallVector<VPValue *> LoopExtracts(UF);
2203 ExtractFromALM(LoopALM, LoopExtracts);
2204 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2205 Not->setOperand(0, LoopExtracts[0]);
2206
2207 // Update the incoming values of active lane mask phis.
2208 for (unsigned Part = 0; Part < UF; ++Part) {
2209 Phis[Part]->setStartValue(EntryExtracts[Part]);
2210 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2211 }
2212
2213 return true;
2214}
2215
2216/// Try to simplify the branch condition of \p Plan. This may restrict the
2217/// resulting plan to \p BestVF and \p BestUF.
2219 unsigned BestUF,
2221 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2222 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2223 auto *Term = &ExitingVPBB->back();
2224 VPValue *Cond;
2225 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2226 // Check if the branch condition compares the canonical IV increment (for main
2227 // loop), or the canonical IV increment plus an offset (for epilog loop).
2228 if (match(Term, m_BranchOnCount(
2229 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2230 m_VPValue())) ||
2232 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2233 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2234 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2235 const SCEV *VectorTripCount =
2237 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2238 VectorTripCount =
2240 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2241 "Trip count SCEV must be computable");
2242 ScalarEvolution &SE = *PSE.getSE();
2243 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2244 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2245 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2246 return false;
2247 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2249 // For BranchOnCond, check if we can prove the condition to be true using VF
2250 // and UF.
2251 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2252 return false;
2253 } else {
2254 return false;
2255 }
2256
2257 // The vector loop region only executes once. Convert terminator of the
2258 // exiting block to exit in the first iteration.
2259 if (match(Term, m_BranchOnTwoConds())) {
2260 Term->setOperand(1, Plan.getTrue());
2261 return true;
2262 }
2263
2264 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2265 {}, Term->getDebugLoc());
2266 ExitingVPBB->appendRecipe(BOC);
2267 Term->eraseFromParent();
2268
2269 return true;
2270}
2271
2272/// From the definition of llvm.experimental.get.vector.length,
2273/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2277 vp_depth_first_deep(Plan.getEntry()))) {
2278 for (VPRecipeBase &R : *VPBB) {
2279 VPValue *AVL;
2280 if (!match(&R, m_EVL(m_VPValue(AVL))))
2281 continue;
2282
2283 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2284 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2285 continue;
2286 ScalarEvolution &SE = *PSE.getSE();
2287 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2288 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2289 continue;
2290
2292 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2293 R.getDebugLoc());
2294 if (Trunc != AVL) {
2295 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2296 const DataLayout &DL = Plan.getDataLayout();
2297 VPTypeAnalysis TypeInfo(Plan);
2298 if (VPValue *Folded =
2299 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2300 Trunc = Folded;
2301 }
2302 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2303 return true;
2304 }
2305 }
2306 return false;
2307}
2308
2310 unsigned BestUF,
2312 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2313 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2314
2315 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2316 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2317 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2318
2319 if (MadeChange) {
2320 Plan.setVF(BestVF);
2321 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2322 }
2323}
2324
2325/// Sink users of \p FOR after the recipe defining the previous value \p
2326/// Previous of the recurrence. \returns true if all users of \p FOR could be
2327/// re-arranged as needed or false if it is not possible.
2328static bool
 // NOTE(review): orig line 2329 (first parameter, presumably
 // `sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,`)
 // is elided in this listing.
2330 VPRecipeBase *Previous,
2331 VPDominatorTree &VPDT) {
2332 // If Previous is a live-in (no defining recipe), it naturally dominates all
2333 // recipes in the loop, so no sinking is needed.
2334 if (!Previous)
2335 return true;
2336
2337 // Collect recipes that need sinking.
 // NOTE(review): orig lines 2338-2339, declaring WorkList and Seen, are
 // elided in this listing.
2340 Seen.insert(Previous);
2341 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2342 // The previous value must not depend on the users of the recurrence phi. In
2343 // that case, FOR is not a fixed order recurrence.
2344 if (SinkCandidate == Previous)
2345 return false;
2346
 // Header phis stay put; already-seen or already-dominated candidates need
 // no sinking.
2347 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2348 !Seen.insert(SinkCandidate).second ||
2349 VPDT.properlyDominates(Previous, SinkCandidate))
2350 return true;
2351
2352 if (cannotHoistOrSinkRecipe(*SinkCandidate, /*Sinking=*/true))
2353 return false;
2354
2355 WorkList.push_back(SinkCandidate);
2356 return true;
2357 };
2358
2359 // Recursively sink users of FOR after Previous.
 // WorkList grows while iterating, so this visits transitive users of FOR.
2360 WorkList.push_back(FOR);
2361 for (unsigned I = 0; I != WorkList.size(); ++I) {
2362 VPRecipeBase *Current = WorkList[I];
2363 assert(Current->getNumDefinedValues() == 1 &&
2364 "only recipes with a single defined value expected");
2365
2366 for (VPUser *User : Current->getVPSingleValue()->users()) {
2367 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2368 return false;
2369 }
2370 }
2371
2372 // Keep recipes to sink ordered by dominance so earlier instructions are
2373 // processed first.
2374 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2375 return VPDT.properlyDominates(A, B);
2376 });
2377
 // Move each candidate directly after the previously-placed one, preserving
 // the dominance order established above.
2378 for (VPRecipeBase *SinkCandidate : WorkList) {
2379 if (SinkCandidate == FOR)
2380 continue;
2381
2382 SinkCandidate->moveAfter(Previous);
2383 Previous = SinkCandidate;
2384 }
2385 return true;
2386}
2387
2388/// Try to hoist \p Previous and its operands before all users of \p FOR.
2390 VPRecipeBase *Previous,
2391 VPDominatorTree &VPDT) {
2392 if (cannotHoistOrSinkRecipe(*Previous))
2393 return false;
2394
2395 // Collect recipes that need hoisting.
2396 SmallVector<VPRecipeBase *> HoistCandidates;
2398 VPRecipeBase *HoistPoint = nullptr;
2399 // Find the closest hoist point by looking at all users of FOR and selecting
2400 // the recipe dominating all other users.
2401 for (VPUser *U : FOR->users()) {
2402 auto *R = cast<VPRecipeBase>(U);
2403 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2404 HoistPoint = R;
2405 }
2406 assert(all_of(FOR->users(),
2407 [&VPDT, HoistPoint](VPUser *U) {
2408 auto *R = cast<VPRecipeBase>(U);
2409 return HoistPoint == R ||
2410 VPDT.properlyDominates(HoistPoint, R);
2411 }) &&
2412 "HoistPoint must dominate all users of FOR");
2413
2414 auto NeedsHoisting = [HoistPoint, &VPDT,
2415 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2416 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2417 if (!HoistCandidate)
2418 return nullptr;
2419 VPRegionBlock *EnclosingLoopRegion =
2420 HoistCandidate->getParent()->getEnclosingLoopRegion();
2421 assert((!HoistCandidate->getRegion() ||
2422 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2423 "CFG in VPlan should still be flat, without replicate regions");
2424 // Hoist candidate was already visited, no need to hoist.
2425 if (!Visited.insert(HoistCandidate).second)
2426 return nullptr;
2427
2428 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2429 // hoisting.
2430 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2431 return nullptr;
2432
2433 // If we reached a recipe that dominates HoistPoint, we don't need to
2434 // hoist the recipe.
2435 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2436 return nullptr;
2437 return HoistCandidate;
2438 };
2439
2440 if (!NeedsHoisting(Previous->getVPSingleValue()))
2441 return true;
2442
2443 // Recursively try to hoist Previous and its operands before all users of FOR.
2444 HoistCandidates.push_back(Previous);
2445
2446 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2447 VPRecipeBase *Current = HoistCandidates[I];
2448 assert(Current->getNumDefinedValues() == 1 &&
2449 "only recipes with a single defined value expected");
2450 if (cannotHoistOrSinkRecipe(*Current))
2451 return false;
2452
2453 for (VPValue *Op : Current->operands()) {
2454 // If we reach FOR, it means the original Previous depends on some other
2455 // recurrence that in turn depends on FOR. If that is the case, we would
2456 // also need to hoist recipes involving the other FOR, which may break
2457 // dependencies.
2458 if (Op == FOR)
2459 return false;
2460
2461 if (auto *R = NeedsHoisting(Op)) {
2462 // Bail out if the recipe defines multiple values.
2463 // TODO: Hoisting such recipes requires additional handling.
2464 if (R->getNumDefinedValues() != 1)
2465 return false;
2466 HoistCandidates.push_back(R);
2467 }
2468 }
2469 }
2470
2471 // Order recipes to hoist by dominance so earlier instructions are processed
2472 // first.
2473 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2474 return VPDT.properlyDominates(A, B);
2475 });
2476
2477 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2478 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2479 HoistPoint->getIterator());
2480 }
2481
2482 return true;
2483}
2484
2486 VPBuilder &LoopBuilder) {
2487 VPDominatorTree VPDT(Plan);
2488 VPTypeAnalysis TypeInfo(Plan);
2489
2491 for (VPRecipeBase &R :
2494 RecurrencePhis.push_back(FOR);
2495
2496 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2498 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2499 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2500 // to terminate.
2501 while (auto *PrevPhi =
2503 assert(PrevPhi->getParent() == FOR->getParent());
2504 assert(SeenPhis.insert(PrevPhi).second);
2505 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2506 }
2507
2508 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2509 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2510 return false;
2511
2512 // Introduce a recipe to combine the incoming and previous values of a
2513 // fixed-order recurrence.
2514 VPBasicBlock *InsertBlock =
2515 Previous ? Previous->getParent() : FOR->getParent();
2516 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2517 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2518 else
2519 LoopBuilder.setInsertPoint(InsertBlock,
2520 std::next(Previous->getIterator()));
2521
2522 auto *RecurSplice =
2524 {FOR, FOR->getBackedgeValue()});
2525
2526 FOR->replaceAllUsesWith(RecurSplice);
2527 // Set the first operand of RecurSplice to FOR again, after replacing
2528 // all users.
2529 RecurSplice->setOperand(0, FOR);
2530
2531 // Check for users extracting at the penultimate active lane of the FOR.
2532 // If only a single lane is active in the current iteration, we need to
2533 // select the last element from the previous iteration (from the FOR phi
2534 // directly).
2535 for (VPUser *U : RecurSplice->users()) {
2537 m_Specific(RecurSplice))))
2538 continue;
2539
2541 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2542 Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2543 VPValue *Zero = Plan.getConstantInt(Ty, 0);
2544 VPValue *One = Plan.getConstantInt(Ty, 1);
2545 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2546 VPValue *PenultimateLastIter =
2547 B.createNaryOp(VPInstruction::ExtractLane,
2548 {PenultimateIndex, FOR->getBackedgeValue()});
2549 VPValue *LastPrevIter =
2550 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2551
2552 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2553 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2554 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2555 }
2556 }
2557 return true;
2558}
2559
2561 for (VPRecipeBase &R :
2563 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2564 if (!PhiR)
2565 continue;
2566 RecurKind RK = PhiR->getRecurrenceKind();
2567 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2569 continue;
2570
2571 for (VPUser *U : collectUsersRecursively(PhiR))
2572 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2573 RecWithFlags->dropPoisonGeneratingFlags();
2574 }
2575 }
2576}
2577
2578namespace {
// DenseMapInfo specialization keying VPSingleDefRecipes by their semantic
// content (recipe ID, opcode/intrinsic, GEP source element type, result type,
// flags, operands) so that equivalent recipes hash/compare equal for CSE.
2579struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
 // True for the DenseMap empty/tombstone sentinels, which must never be
 // dereferenced by getHashValue/isEqual.
2580 static bool isSentinel(const VPSingleDefRecipe *Def) {
2581 return Def == getEmptyKey() || Def == getTombstoneKey();
2582 }
2583
2584 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2585 /// return that source element type.
2586 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2587 // All VPInstructions that lower to GEPs must have the i8 source element
2588 // type (as they are PtrAdds), so we omit it.
 // NOTE(review): orig line 2589 (presumably the `return TypeSwitch<...>(R)`
 // opener) is elided in this listing.
2590 .Case([](const VPReplicateRecipe *I) -> Type * {
2591 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2592 return GEP->getSourceElementType();
2593 return nullptr;
2594 })
2595 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2596 [](auto *I) { return I->getSourceElementType(); })
2597 .Default([](auto *) { return nullptr; });
2598 }
2599
2600 /// Returns true if recipe \p Def can be safely handed for CSE.
2601 static bool canHandle(const VPSingleDefRecipe *Def) {
2602 // We can extend the list of handled recipes in the future,
2603 // provided we account for the data embedded in them while checking for
2604 // equality or hashing.
2605 auto C = getOpcodeOrIntrinsicID(Def);
2606
2607 // The issue with (Insert|Extract)Value is that the index of the
2608 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2609 // VPlan.
2610 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2611 C->second == Instruction::ExtractValue)))
2612 return false;
2613
2614 // During CSE, we can only handle recipes that don't read from memory: if
2615 // they read from memory, there could be an intervening write to memory
2616 // before the next instance is CSE'd, leading to an incorrect result.
2617 return !Def->mayReadFromMemory();
2618 }
2619
2620 /// Hash the underlying data of \p Def.
2621 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2622 const VPlan *Plan = Def->getParent()->getPlan();
2623 VPTypeAnalysis TypeInfo(*Plan);
 // NOTE(review): orig line 2627 (final hash_combine argument, presumably
 // hashing Def's operand range) is elided in this listing.
2624 hash_code Result = hash_combine(
2625 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2626 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2628 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2629 if (RFlags->hasPredicate())
2630 return hash_combine(Result, RFlags->getPredicate());
2631 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2632 return hash_combine(Result, SIVSteps->getInductionOpcode());
2633 return Result;
2634 }
2635
2636 /// Check equality of underlying data of \p L and \p R.
 // Must agree with getHashValue: every field hashed there is compared here.
2637 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2638 if (isSentinel(L) || isSentinel(R))
2639 return L == R;
 // NOTE(review): orig lines 2641, 2643 and 2646 (opcode/intrinsic-ID
 // comparison and the assert condition) are elided in this listing.
2640 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2642 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2644 !equal(L->operands(), R->operands()))
2645 return false;
2647 "must have valid opcode info for both recipes");
2648 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2649 if (LFlags->hasPredicate() &&
2650 LFlags->getPredicate() !=
2651 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2652 return false;
2653 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2654 if (LSIV->getInductionOpcode() !=
2655 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2656 return false;
2657 // Recipes in replicate regions implicitly depend on predicate. If either
2658 // recipe is in a replicate region, only consider them equal if both have
2659 // the same parent.
2660 const VPRegionBlock *RegionL = L->getRegion();
2661 const VPRegionBlock *RegionR = R->getRegion();
2662 if (((RegionL && RegionL->isReplicator()) ||
2663 (RegionR && RegionR->isReplicator())) &&
2664 L->getParent() != R->getParent())
2665 return false;
2666 const VPlan *Plan = L->getParent()->getPlan();
2667 VPTypeAnalysis TypeInfo(*Plan);
 // Finally require identical inferred scalar result types.
2668 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2669 }
2670};
2671} // end anonymous namespace
2672
2673/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2674/// Plan.
2676 VPDominatorTree VPDT(Plan);
2678
2680 Plan.getEntry());
2682 for (VPRecipeBase &R : *VPBB) {
2683 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2684 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2685 continue;
2686 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2687 // V must dominate Def for a valid replacement.
2688 if (!VPDT.dominates(V->getParent(), VPBB))
2689 continue;
2690 // Only keep flags present on both V and Def.
2691 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2692 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2693 Def->replaceAllUsesWith(V);
2694 continue;
2695 }
2696 CSEMap[Def] = Def;
2697 }
2698 }
2699}
2700
2701 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
2702 static void licm(VPlan &Plan) {
2703 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2704
2705 // Hoist any loop invariant recipes from the vector loop region to the
2706 // preheader. Perform a shallow traversal of the vector loop region, to
2707 // exclude recipes in replicate regions. Since the top-level blocks in the
2708 // vector loop region are guaranteed to execute if the vector pre-header is,
2709 // we don't need to check speculation safety.
2710 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2711 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2712 "Expected vector prehader's successor to be the vector loop region");
// NOTE(review): the loop header introducing VPBB (original line 2713) is
// missing from this excerpt; presumably it iterates the shallow RPO of the
// region's blocks — confirm against the full source.
2714 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2715 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// NOTE(review): the hoistability guard (original line 2716) is missing here.
2717 continue;
// Only hoist when every operand is defined outside all loop regions, so the
// recipe's value is loop-invariant by construction.
2718 if (any_of(R.operands(), [](VPValue *Op) {
2719 return !Op->isDefinedOutsideLoopRegions();
2720 }))
2721 continue;
2722 R.moveBefore(*Preheader, Preheader->end());
2723 }
2724 }
2725
// Dominator tree is only needed for the assertion below; avoid building it in
// release builds.
2726#ifndef NDEBUG
2727 VPDominatorTree VPDT(Plan);
2728#endif
2729 // Sink recipes with no users inside the vector loop region if all users are
2730 // in the same exit block of the region.
2731 // TODO: Extend to sink recipes from inner loops.
// NOTE(review): the loop header over the region's blocks (original line 2732)
// is missing from this excerpt.
2733 LoopRegion->getEntry());
// Visit recipes bottom-up so a recipe is only considered after any recipes
// that use it in the same block have already been sunk.
2735 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2736 if (cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2737 continue;
2738
2739 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2740 assert(!RepR->isPredicated() &&
2741 "Expected prior transformation of predicated replicates to "
2742 "replicate regions");
2743 // narrowToSingleScalarRecipes should have already maximally narrowed
2744 // replicates to single-scalar replicates.
2745 // TODO: When unrolling, replicateByVF doesn't handle sunk
2746 // non-single-scalar replicates correctly.
2747 if (!RepR->isSingleScalar())
2748 continue;
2749 }
2750
2751 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2752 // support recipes with multiple defined values (e.g., interleaved loads).
2753 auto *Def = cast<VPSingleDefRecipe>(&R);
2754
2755 // Cannot sink the recipe if the user is defined in a loop region or a
2756 // non-successor of the vector loop region. Cannot sink if user is a phi
2757 // either.
2758 VPBasicBlock *SinkBB = nullptr;
// SinkBB accumulates the unique block containing all users; any_of returning
// true means sinking is not possible and the recipe is left in place.
2759 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2760 auto *UserR = cast<VPRecipeBase>(U);
2761 VPBasicBlock *Parent = UserR->getParent();
2762 // TODO: Support sinking when users are in multiple blocks.
2763 if (SinkBB && SinkBB != Parent)
2764 return true;
2765 SinkBB = Parent;
2766 // TODO: If the user is a PHI node, we should check the block of
2767 // incoming value. Support PHI node users if needed.
2768 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2769 Parent->getSinglePredecessor() != LoopRegion;
2770 }))
2771 continue;
2772
// No users at all: sink to the region's single successor so the recipe no
// longer executes once per vector iteration.
2773 if (!SinkBB)
2774 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2775
2776 // TODO: This will need to be a check instead of an assert after
2777 // conditional branches in vectorized loops are supported.
2778 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2779 "Defining block must dominate sink block");
2780 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2781 // just moving.
2782 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2783 }
2784 }
2785}
2786
2788 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2789 if (Plan.hasScalarVFOnly())
2790 return;
2791 // Keep track of created truncates, so they can be re-used. Note that we
2792 // cannot use RAUW after creating a new truncate, as this would could make
2793 // other uses have different types for their operands, making them invalidly
2794 // typed.
2796 VPTypeAnalysis TypeInfo(Plan);
2797 VPBasicBlock *PH = Plan.getVectorPreheader();
2800 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2803 continue;
2804
2805 VPValue *ResultVPV = R.getVPSingleValue();
2806 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2807 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2808 if (!NewResSizeInBits)
2809 continue;
2810
2811 // If the value wasn't vectorized, we must maintain the original scalar
2812 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2813 // skip casts which do not need to be handled explicitly here, as
2814 // redundant casts will be removed during recipe simplification.
2816 continue;
2817
2818 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2819 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2820 assert(OldResTy->isIntegerTy() && "only integer types supported");
2821 (void)OldResSizeInBits;
2822
2823 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2824
2825 // Any wrapping introduced by shrinking this operation shouldn't be
2826 // considered undefined behavior. So, we can't unconditionally copy
2827 // arithmetic wrapping flags to VPW.
2828 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2829 VPW->dropPoisonGeneratingFlags();
2830
2831 if (OldResSizeInBits != NewResSizeInBits &&
2832 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2833 // Extend result to original width.
2834 auto *Ext = new VPWidenCastRecipe(
2835 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2836 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2837 Ext->insertAfter(&R);
2838 ResultVPV->replaceAllUsesWith(Ext);
2839 Ext->setOperand(0, ResultVPV);
2840 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2841 } else {
2842 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2843 "Only ICmps should not need extending the result.");
2844 }
2845
2846 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2848 continue;
2849
2850 // Shrink operands by introducing truncates as needed.
2851 unsigned StartIdx =
2852 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2853 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2854 auto *Op = R.getOperand(Idx);
2855 unsigned OpSizeInBits =
2857 if (OpSizeInBits == NewResSizeInBits)
2858 continue;
2859 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2860 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2861 if (!IterIsEmpty) {
2862 R.setOperand(Idx, ProcessedIter->second);
2863 continue;
2864 }
2865
2866 VPBuilder Builder;
2867 if (isa<VPIRValue>(Op))
2868 Builder.setInsertPoint(PH);
2869 else
2870 Builder.setInsertPoint(&R);
2871 VPWidenCastRecipe *NewOp =
2872 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2873 ProcessedIter->second = NewOp;
2874 R.setOperand(Idx, NewOp);
2875 }
2876
2877 }
2878 }
2879}
2880
// Remove BranchOnCond terminators whose condition is a constant true/false,
// disconnecting the never-taken successor, and then clean up any blocks that
// became unreachable as a result. With \p OnlyLatches set, only latch blocks
// (as determined via the dominator tree) are considered.
2881 void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only needed to identify latches.
2882 std::optional<VPDominatorTree> VPDT;
2883 if (OnlyLatches)
2884 VPDT.emplace(Plan);
2885
2886 // Collect all blocks before modifying the CFG so we can identify unreachable
2887 // ones after constant branch removal.
// NOTE(review): the line materializing AllBlocks (original line 2888) is
// missing from this excerpt — presumably a depth-first collection of all
// blocks from the entry; confirm against the full source.
2889
2890 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2891 VPValue *Cond;
2892 // Skip blocks that are not terminated by BranchOnCond.
2893 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2894 continue;
2895
2896 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2897 continue;
2898
2899 assert(VPBB->getNumSuccessors() == 2 &&
2900 "Two successors expected for BranchOnCond");
// A true condition always takes successor 0, so successor 1 is dead (and
// vice versa for false). Non-constant conditions are left untouched.
2901 unsigned RemovedIdx;
2902 if (match(Cond, m_True()))
2903 RemovedIdx = 1;
2904 else if (match(Cond, m_False()))
2905 RemovedIdx = 0;
2906 else
2907 continue;
2908
2909 VPBasicBlock *RemovedSucc =
2910 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2911 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2912 "There must be a single edge between VPBB and its successor");
2913 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2914 // these recipes.
2915 for (VPRecipeBase &R : RemovedSucc->phis())
2916 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2917
2918 // Disconnect blocks and remove the terminator.
2919 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2920 VPBB->back().eraseFromParent();
2921 }
2922
2923 // Compute which blocks are still reachable from the entry after constant
2924 // branch removal.
// NOTE(review): the lines computing the Reachable set (original lines
// 2925-2926) are missing from this excerpt.
2927
2928 // Detach all unreachable blocks from their successors, removing their recipes
2929 // and incoming values from phi recipes.
// Tmp stands in for values defined by dead recipes so that erasing them does
// not leave dangling uses while other dead blocks still reference them.
2930 VPSymbolicValue Tmp;
2931 for (VPBlockBase *B : AllBlocks) {
2932 if (Reachable.contains(B))
2933 continue;
// Copy the successor list first (to_vector) since disconnecting mutates it.
2934 for (VPBlockBase *Succ : to_vector(B->successors())) {
2935 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2936 for (VPRecipeBase &R : SuccBB->phis())
2937 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
// NOTE(review): the disconnect call (original line 2938) is missing from this
// excerpt.
2939 }
2940 for (VPBasicBlock *DeadBB :
// NOTE(review): the range expression over dead basic blocks (original line
// 2941) is missing from this excerpt.
2942 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2943 for (VPValue *Def : R.definedValues())
2944 Def->replaceAllUsesWith(&Tmp);
2945 R.eraseFromParent();
2946 }
2947 }
2948 }
2949}
2950
2972
2973// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2974// the loop terminator with a branch-on-cond recipe with the negated
2975// active-lane-mask as operand. Note that this turns the loop into an
2976// uncountable one. Only the existing terminator is replaced, all other existing
2977// recipes/users remain unchanged, except for poison-generating flags being
2978// dropped from the canonical IV increment. Return the created
2979// VPActiveLaneMaskPHIRecipe.
2980//
2981// The function adds the following recipes:
2982//
2983// vector.ph:
2984// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2985// %EntryALM = active-lane-mask %EntryInc, TC
2986//
2987// vector.body:
2988// ...
2989// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2990// ...
2991// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2992// %ALM = active-lane-mask %InLoopInc, TC
2993// %Negated = Not %ALM
2994// branch-on-cond %Negated
2995//
2998 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2999 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
3000 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
3001 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
3002 // TODO: Check if dropping the flags is needed.
3003 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
3004 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
3005 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
3006 // we have to take unrolling into account. Each part needs to start at
3007 // Part * VF
3008 auto *VecPreheader = Plan.getVectorPreheader();
3009 VPBuilder Builder(VecPreheader);
3010
3011 // Create the ActiveLaneMask instruction using the correct start values.
3012 VPValue *TC = Plan.getTripCount();
3013 VPValue *VF = &Plan.getVF();
3014
3015 auto *EntryIncrement = Builder.createOverflowingOp(
3016 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
3017 DL, "index.part.next");
3018
3019 // Create the active lane mask instruction in the VPlan preheader.
3020 VPValue *ALMMultiplier =
3021 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
3022 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3023 {EntryIncrement, TC, ALMMultiplier}, DL,
3024 "active.lane.mask.entry");
3025
3026 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
3027 // preheader ActiveLaneMask instruction.
3028 auto *LaneMaskPhi =
3030 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
3031 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
3032
3033 // Create the active lane mask for the next iteration of the loop before the
3034 // original terminator.
3035 VPRecipeBase *OriginalTerminator = EB->getTerminator();
3036 Builder.setInsertPoint(OriginalTerminator);
3037 auto *InLoopIncrement = Builder.createOverflowingOp(
3039 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3040 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3041 {InLoopIncrement, TC, ALMMultiplier}, DL,
3042 "active.lane.mask.next");
3043 LaneMaskPhi->addOperand(ALM);
3044
3045 // Replace the original terminator with BranchOnCond. We have to invert the
3046 // mask here because a true condition means jumping to the exit block.
3047 auto *NotMask = Builder.createNot(ALM, DL);
3048 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3049 OriginalTerminator->eraseFromParent();
3050 return LaneMaskPhi;
3051}
3052
3054 bool UseActiveLaneMaskForControlFlow) {
3055 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3056 auto *FoundWidenCanonicalIVUser = find_if(
3058 assert(FoundWidenCanonicalIVUser &&
3059 "Must have widened canonical IV when tail folding!");
3060 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3061 auto *WideCanonicalIV =
3062 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
3063 VPSingleDefRecipe *LaneMask;
3064 if (UseActiveLaneMaskForControlFlow) {
3065 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3066 } else {
3067 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3068 VPValue *ALMMultiplier =
3069 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3070 LaneMask =
3071 B.createNaryOp(VPInstruction::ActiveLaneMask,
3072 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3073 nullptr, "active.lane.mask");
3074 }
3075
3076 // Walk users of WideCanonicalIV and replace the header mask of the form
3077 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3078 // removing the old one to ensure there is always only a single header mask.
3079 HeaderMask->replaceAllUsesWith(LaneMask);
3080 HeaderMask->eraseFromParent();
3081}
3082
// Pattern-match helper used by m_RemoveMask: matches either the mask \p In
// itself (setting Out to nullptr), or a logical-and of \p In with another
// mask (setting Out to that other operand).
3083 template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
// The mask to strip from the matched value.
3084 Op0_t In;
// NOTE(review): the declaration of the Out member (original line 3085,
// presumably `Op1_t &Out;`) is missing from this excerpt — it is assigned in
// match() and initialized in the constructor below.
3086
3087 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3088
// Returns true if \p V is In (Out = nullptr) or (logical-and In, Out).
3089 template <typename OpTy> bool match(OpTy *V) const {
3090 if (m_Specific(In).match(V)) {
3091 Out = nullptr;
3092 return true;
3093 }
3094 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3095 }
3096};
3097
3098/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3099/// Returns the remaining part \p Out if so, or nullptr otherwise.
3100template <typename Op0_t, typename Op1_t>
3101static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3102 Op1_t &Out) {
3103 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3104}
3105
3106/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3107/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3108/// recipe could be created.
3109/// \p HeaderMask Header Mask.
3110/// \p CurRecipe Recipe to be transform.
3111/// \p TypeInfo VPlan-based type analysis.
3112/// \p EVL The explicit vector length parameter of vector-predication
3113/// intrinsics.
3115 VPRecipeBase &CurRecipe,
3116 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3117 VPlan *Plan = CurRecipe.getParent()->getPlan();
3118 DebugLoc DL = CurRecipe.getDebugLoc();
3119 VPValue *Addr, *Mask, *EndPtr;
3120
3121 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3122 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3123 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3124 EVLEndPtr->insertBefore(&CurRecipe);
3125 EVLEndPtr->setOperand(1, &EVL);
3126 return EVLEndPtr;
3127 };
3128
3129 auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
3131 if (!V)
3132 return nullptr;
3133 auto *Reverse = new VPWidenIntrinsicRecipe(
3134 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3135 TypeInfo.inferScalarType(V), {}, {}, DL);
3136 Reverse->insertBefore(&CurRecipe);
3137 return Reverse;
3138 };
3139
3140 if (match(&CurRecipe,
3141 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3142 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3143 EVL, Mask);
3144
3145 VPValue *ReversedVal;
3146 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3147 match(ReversedVal,
3148 m_MaskedLoad(m_VPValue(EndPtr),
3149 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3150 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3151 Mask = GetVPReverse(Mask);
3152 Addr = AdjustEndPtr(EndPtr);
3153 auto *LoadR = new VPWidenLoadEVLRecipe(
3154 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
3155 LoadR->insertBefore(&CurRecipe);
3156 return new VPWidenIntrinsicRecipe(
3157 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3158 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3159 }
3160
3161 VPValue *StoredVal;
3162 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3163 m_RemoveMask(HeaderMask, Mask))))
3164 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3165 StoredVal, EVL, Mask);
3166
3167 if (match(&CurRecipe,
3168 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3169 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3170 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3171 Mask = GetVPReverse(Mask);
3172 Addr = AdjustEndPtr(EndPtr);
3173 StoredVal = GetVPReverse(ReversedVal);
3174 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3175 StoredVal, EVL, Mask);
3176 }
3177
3178 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3179 if (Rdx->isConditional() &&
3180 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3181 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3182
3183 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3184 if (Interleave->getMask() &&
3185 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3186 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3187
3188 VPValue *LHS, *RHS;
3189 if (match(&CurRecipe,
3190 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3191 return new VPWidenIntrinsicRecipe(
3192 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3193 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3194
3195 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3196 m_VPValue(RHS))))
3197 return new VPWidenIntrinsicRecipe(
3198 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3199 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3200
3201 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3202 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3203 VPValue *ZExt = VPBuilder(&CurRecipe)
3205 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3206 return new VPInstruction(
3207 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3208 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3209 }
3210
3211 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3212 if (match(&CurRecipe,
3214 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3215 return new VPWidenIntrinsicRecipe(
3216 Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
3217 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3218
3219 return nullptr;
3220}
3221
3222/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3223/// The transforms here need to preserve the original semantics.
3225 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3226 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3229 m_VPValue(EVL))) &&
3230 match(EVL, m_EVL(m_VPValue()))) {
3231 HeaderMask = R.getVPSingleValue();
3232 break;
3233 }
3234 }
3235 if (!HeaderMask)
3236 return;
3237
3238 VPTypeAnalysis TypeInfo(Plan);
3239 SmallVector<VPRecipeBase *> OldRecipes;
3240 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3242 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3243 NewR->insertBefore(R);
3244 for (auto [Old, New] :
3245 zip_equal(R->definedValues(), NewR->definedValues()))
3246 Old->replaceAllUsesWith(New);
3247 OldRecipes.push_back(R);
3248 }
3249 }
3250
3251 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3252 // False, EVL)
3253 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3254 VPValue *Mask;
3255 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3256 auto *LogicalAnd = cast<VPInstruction>(U);
3257 auto *Merge = new VPWidenIntrinsicRecipe(
3258 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3259 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3260 Merge->insertBefore(LogicalAnd);
3261 LogicalAnd->replaceAllUsesWith(Merge);
3262 OldRecipes.push_back(LogicalAnd);
3263 }
3264 }
3265
3266 // Erase old recipes at the end so we don't invalidate TypeInfo.
3267 for (VPRecipeBase *R : reverse(OldRecipes)) {
3268 SmallVector<VPValue *> PossiblyDead(R->operands());
3269 R->eraseFromParent();
3270 for (VPValue *Op : PossiblyDead)
3272 }
3273}
3274
3275 /// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3276 /// VF to use the EVL instead to avoid incorrect updates on the penultimate
3277 /// iteration.
3278 static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3279 VPTypeAnalysis TypeInfo(Plan);
3280 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3281 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3282
// Sanity-check that every user of VF is one we know how to rewrite, then
// redirect those uses to the EVL.
// NOTE(review): the predicate lines of this assert (original lines 3284-3285)
// and the body of the replaceUsesWithIf filter (3288) are missing from this
// excerpt.
3283 assert(all_of(Plan.getVF().users(),
3286 "User of VF that we can't transform to EVL.");
3287 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3289 });
3290
// Same for VFxUF: only pointer-induction recipes may be rewritten; the
// canonical IV increment must keep stepping by VFxUF.
3291 assert(all_of(Plan.getVFxUF().users(),
3293 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3294 m_Specific(&Plan.getVFxUF())),
3296 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3297 "increment of the canonical induction.");
3298 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3299 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3300 // canonical induction must not be updated.
3302 });
3303
3304 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3305 // contained.
// NOTE(review): the expression initializing ContainsFORs (original line 3307)
// is missing from this excerpt.
3306 bool ContainsFORs =
3308 if (ContainsFORs) {
3309 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3310 VPValue *MaxEVL = &Plan.getVF();
3311 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3312 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3313 MaxEVL = Builder.createScalarZExtOrTrunc(
3314 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3315 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3316
// prev.evl starts at the maximum EVL in the preheader and takes the current
// EVL on the backedge.
3317 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3318 VPValue *PrevEVL = Builder.createScalarPhi(
3319 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3320
// Rewrite first-order-recurrence splices into @llvm.vp.splice using the
// previous and current EVL.
// NOTE(review): the surrounding block-iteration header (original lines
// 3321-3322) and the matched opcode (3326) are missing from this excerpt.
3323 for (VPRecipeBase &R : *VPBB) {
3324 VPValue *V1, *V2;
3325 if (!match(&R,
3327 m_VPValue(V1), m_VPValue(V2))))
3328 continue;
3329 VPValue *Imm = Plan.getOrAddLiveIn(
3332 Intrinsic::experimental_vp_splice,
3333 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3334 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3335 R.getDebugLoc());
3336 VPSplice->insertBefore(&R);
3337 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3338 }
3339 }
3340 }
3341
3342 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3343 if (!HeaderMask)
3344 return;
3345
3346 // Ensure that any reduction that uses a select to mask off tail lanes does so
3347 // in the vector loop, not the middle block, since EVL tail folding can have
3348 // tail elements in the penultimate iteration.
3349 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3350 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3351 m_VPValue(), m_VPValue()))))
3352 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3353 Plan.getVectorLoopRegion();
3354 return true;
3355 }));
3356
3357 // Replace header masks with a mask equivalent to predicating by EVL:
3358 //
3359 // icmp ule widen-canonical-iv backedge-taken-count
3360 // ->
3361 // icmp ult step-vector, EVL
// The new mask is emitted directly after the EVL definition so it dominates
// all former users of the header mask.
3362 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3363 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3364 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3365 VPValue *EVLMask = Builder.createICmp(
3367 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3368 HeaderMask->replaceAllUsesWith(EVLMask);
3369}
3370
3371/// Converts a tail folded vector loop region to step by
3372/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3373/// iteration.
3374///
3375/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3376/// replaces all uses of the canonical IV except for the canonical IV
3377/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3378/// only for loop iterations counting after this transformation.
3379///
3380/// - The header mask is replaced with a header mask based on the EVL.
3381///
3382/// - Plans with FORs have a new phi added to keep track of the EVL of the
3383/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3384/// @llvm.vp.splice.
3385///
3386/// The function uses the following definitions:
3387/// %StartV is the canonical induction start value.
3388///
3389/// The function adds the following recipes:
3390///
3391/// vector.ph:
3392/// ...
3393///
3394/// vector.body:
3395/// ...
3396/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3397/// [ %NextIter, %vector.body ]
3398/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3399/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3400/// ...
3401/// %OpEVL = cast i32 %VPEVL to IVSize
3402/// %NextIter = add IVSize %OpEVL, %CurrentIter
3403/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3404/// ...
3405///
3406/// If MaxSafeElements is provided, the function adds the following recipes:
3407/// vector.ph:
3408/// ...
3409///
3410/// vector.body:
3411/// ...
3412/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3413/// [ %NextIter, %vector.body ]
3414/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3415/// %cmp = cmp ult %AVL, MaxSafeElements
3416/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3417/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3418/// ...
3419/// %OpEVL = cast i32 %VPEVL to IVSize
3420/// %NextIter = add IVSize %OpEVL, %CurrentIter
3421/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3422/// ...
3423///
3425 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3426 if (Plan.hasScalarVFOnly())
3427 return;
3428 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3429 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3430
3431 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3432 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3433 VPValue *StartV = Plan.getZero(CanIVTy);
3434 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3435
3436 // Create the CurrentIteration recipe in the vector loop.
3437 auto *CurrentIteration =
3439 CurrentIteration->insertBefore(*Header, Header->begin());
3440 VPBuilder Builder(Header, Header->getFirstNonPhi());
3441 // Create the AVL (application vector length), starting from TC -> 0 in steps
3442 // of EVL.
3443 VPPhi *AVLPhi = Builder.createScalarPhi(
3444 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3445 VPValue *AVL = AVLPhi;
3446
3447 if (MaxSafeElements) {
3448 // Support for MaxSafeDist for correct loop emission.
3449 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3450 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3451 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3452 "safe_avl");
3453 }
3454 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3455 DebugLoc::getUnknown(), "evl");
3456
3457 Builder.setInsertPoint(CanonicalIVIncrement);
3458 VPValue *OpVPEVL = VPEVL;
3459
3460 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3461 OpVPEVL = Builder.createScalarZExtOrTrunc(
3462 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3463
3464 auto *NextIter = Builder.createAdd(
3465 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3466 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3467 CurrentIteration->addOperand(NextIter);
3468
3469 VPValue *NextAVL =
3470 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3471 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3472 AVLPhi->addOperand(NextAVL);
3473
3474 fixupVFUsersForEVL(Plan, *VPEVL);
3475 removeDeadRecipes(Plan);
3476
3477 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3478 // except for the canonical IV increment.
3479 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3480 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3481 // TODO: support unroll factor > 1.
3482 Plan.setUF(1);
3483}
3484
3486 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3487 // There should be only one VPCurrentIteration in the entire plan.
3488 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3489
3492 for (VPRecipeBase &R : VPBB->phis())
3493 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3494 assert(!CurrentIteration &&
3495 "Found multiple CurrentIteration. Only one expected");
3496 CurrentIteration = PhiR;
3497 }
3498
3499 // Early return if it is not variable-length stepping.
3500 if (!CurrentIteration)
3501 return;
3502
3503 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3504 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3505
3506 // Convert CurrentIteration to concrete recipe.
3507 auto *ScalarR =
3508 VPBuilder(CurrentIteration)
3510 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3511 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3512 CurrentIteration->replaceAllUsesWith(ScalarR);
3513 CurrentIteration->eraseFromParent();
3514
3515 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3516 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3517 if (auto *CanIVInc = vputils::findUserOf(
3518 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3519 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3520 CanIVInc->eraseFromParent();
3521 }
3522}
3523
3525 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3526 if (!LoopRegion)
3527 return;
3528 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3529 if (Header->empty())
3530 return;
3531 // The EVL IV is always at the beginning.
3532 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3533 if (!EVLPhi)
3534 return;
3535
3536 // Bail if not an EVL tail folded loop.
3537 VPValue *AVL;
3538 if (!match(EVLPhi->getBackedgeValue(),
3539 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3540 return;
3541
3542 // The AVL may be capped to a safe distance.
3543 VPValue *SafeAVL, *UnsafeAVL;
3544 if (match(AVL,
3546 m_VPValue(SafeAVL)),
3547 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3548 AVL = UnsafeAVL;
3549
3550 VPValue *AVLNext;
3551 [[maybe_unused]] bool FoundAVLNext =
3553 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3554 assert(FoundAVLNext && "Didn't find AVL backedge?");
3555
3556 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3557 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3558 if (match(LatchBr, m_BranchOnCond(m_True())))
3559 return;
3560
3561 VPValue *CanIVInc;
3562 [[maybe_unused]] bool FoundIncrement = match(
3563 LatchBr,
3565 m_Specific(&Plan.getVectorTripCount()))));
3566 assert(FoundIncrement &&
3567 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3568 m_Specific(&Plan.getVFxUF()))) &&
3569 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3570 "trip count");
3571
3572 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3573 VPBuilder Builder(LatchBr);
3574 LatchBr->setOperand(
3575 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3576}
3577
// NOTE(review): the line carrying this function's name is not visible in this
// listing (embedded numbering jumps 3577->3579); from the body it appears to
// be VPlanTransforms::replaceSymbolicStrides — confirm against the header.
// Replaces symbolic stride VPValues with constants that PSE's predicates
// guarantee, and rewrites SCEV expansions in the entry block accordingly.
3579 VPlan &Plan, PredicatedScalarEvolution &PSE,
3580 const DenseMap<Value *, const SCEV *> &StridesMap) {
3581 // Replace VPValues for known constant strides guaranteed by predicate scalar
3582 // evolution.
// Only rewrite uses inside a region (the vector loop) or in the vector loop
// region's single predecessor (its preheader); uses elsewhere must keep the
// unversioned stride value.
3583 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3584 auto *R = cast<VPRecipeBase>(&U);
3585 return R->getRegion() ||
3586 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3587 };
// Maps stride Values to their (predicated) SCEVs, used below to rewrite
// VPExpandSCEVRecipes so expansions also see the versioned strides.
3588 ValueToSCEVMapTy RewriteMap;
3589 for (const SCEV *Stride : StridesMap.values()) {
3590 using namespace SCEVPatternMatch;
3591 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3592 const APInt *StrideConst;
3593 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3594 // Only handle constant strides for now.
3595 continue;
3596
3597 auto *CI = Plan.getConstantInt(*StrideConst);
3598 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3599 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride)2;
3600
3601 // The versioned value may not be used in the loop directly but through a
3602 // sext/zext. Add new live-ins in those cases.
3603 for (Value *U : StrideV->users()) {
// NOTE(review): a line is missing here (numbering jumps 3603->3605);
// presumably it filtered U to sext/zext instructions before the continue.
3605 continue;
3606 VPValue *StrideVPV = Plan.getLiveIn(U);
3607 if (!StrideVPV)
3608 continue;
// Extend the constant stride to the cast's wider type, matching the
// signedness of the cast (sext vs. zext).
3609 unsigned BW = U->getType()->getScalarSizeInBits();
3610 APInt C =
3611 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3612 VPValue *CI = Plan.getConstantInt(C);
3613 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3614 }
3615 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3616 }
3617
// Rewrite SCEV expressions expanded in the plan's entry so they use the
// versioned strides; keep the trip count pointer in sync when it changes.
3618 for (VPRecipeBase &R : *Plan.getEntry()) {
3619 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3620 if (!ExpSCEV)
3621 continue;
3622 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3623 auto *NewSCEV =
3624 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3625 if (NewSCEV != ScevExpr) {
3626 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3627 ExpSCEV->replaceAllUsesWith(NewExp);
3628 if (Plan.getTripCount() == ExpSCEV)
3629 Plan.resetTripCount(NewExp);
3630 }
3631 }
3632}
3633
// NOTE(review): the line carrying this function's name is not visible here
// (numbering jumps 3633->3635); from the body it appears to be
// VPlanTransforms::dropPoisonGeneratingRecipes — confirm against the header.
// Drops poison-generating flags from recipes feeding the address of
// consecutive widened memory accesses / interleave groups that need
// predication, so masked-off lanes cannot introduce poison.
3635 VPlan &Plan,
3636 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3637 // Collect recipes in the backward slice of `Root` that may generate a poison
3638 // value that is used after vectorization.
// NOTE(review): the declarations of Visited and Worklist (lines 3639/3641 of
// the original) are missing from this listing.
3640 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3642 Worklist.push_back(Root);
3643
3644 // Traverse the backward slice of Root through its use-def chain.
3645 while (!Worklist.empty()) {
3646 VPRecipeBase *CurRec = Worklist.pop_back_val();
3647
3648 if (!Visited.insert(CurRec).second)
3649 continue;
3650
3651 // Prune search if we find another recipe generating a widen memory
3652 // instruction. Widen memory instructions involved in address computation
3653 // will lead to gather/scatter instructions, which don't need to be
3654 // handled.
// NOTE(review): the first half of this isa<> check (line 3655) is missing
// from this listing.
3656 VPHeaderPHIRecipe>(CurRec))
3657 continue;
3658
3659 // This recipe contributes to the address computation of a widen
3660 // load/store. If the underlying instruction has poison-generating flags,
3661 // drop them directly.
3662 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3663 VPValue *A, *B;
3664 // Dropping disjoint from an OR may yield incorrect results, as some
3665 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3666 // for dependence analysis). Instead, replace it with an equivalent Add.
3667 // This is possible as all users of the disjoint OR only access lanes
3668 // where the operands are disjoint or poison otherwise.
3669 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3670 RecWithFlags->isDisjoint()) {
3671 VPBuilder Builder(RecWithFlags);
3672 VPInstruction *New =
3673 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3674 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3675 RecWithFlags->replaceAllUsesWith(New);
3676 RecWithFlags->eraseFromParent();
// Continue the backward walk from the replacement Add.
3677 CurRec = New;
3678 } else
3679 RecWithFlags->dropPoisonGeneratingFlags();
3680 } else {
// NOTE(review): lines 3681-3682 (presumably fetching the underlying
// Instruction for the assert below) are missing from this listing.
3683 (void)Instr;
3684 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3685 "found instruction with poison generating flags not covered by "
3686 "VPRecipeWithIRFlags");
3687 }
3688
3689 // Add new definitions to the worklist.
3690 for (VPValue *Operand : CurRec->operands())
3691 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3692 Worklist.push_back(OpDef);
3693 }
3694 });
3695
3696 // Traverse all the recipes in the VPlan and collect the poison-generating
3697 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3698 // VPInterleaveRecipe.
3699 auto Iter =
// NOTE(review): lines 3700-3701 (the traversal setup and the loop over
// blocks) are missing from this listing.
3702 for (VPRecipeBase &Recipe : *VPBB) {
3703 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3704 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3705 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
// Only consecutive accesses matter: non-consecutive ones become
// gather/scatter, which tolerate poison in masked-off lanes.
3706 if (AddrDef && WidenRec->isConsecutive() &&
3707 BlockNeedsPredication(UnderlyingInstr.getParent()))
3708 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3709 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3710 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3711 if (AddrDef) {
3712 // Check if any member of the interleave group needs predication.
3713 const InterleaveGroup<Instruction> *InterGroup =
3714 InterleaveRec->getInterleaveGroup();
3715 bool NeedPredication = false;
3716 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3717 I < NumMembers; ++I) {
3718 Instruction *Member = InterGroup->getMember(I);
3719 if (Member)
3720 NeedPredication |= BlockNeedsPredication(Member->getParent());
3721 }
3722
3723 if (NeedPredication)
3724 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3725 }
3726 }
3727 }
3728 }
3729}
3730
// NOTE(review): the line carrying this function's name is not visible here
// (numbering jumps 3730->3732); from the body it appears to be
// VPlanTransforms::createInterleaveGroups — confirm against the header.
// Replaces the widened memory recipes of each interleave group with a single
// VPInterleaveRecipe at the group's insert position.
3732 VPlan &Plan,
// NOTE(review): line 3733 (the SmallPtrSetImpl parameter's type) is missing
// from this listing.
3734 &InterleaveGroups,
3735 VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed) {
3736 if (InterleaveGroups.empty())
3737 return;
3738
3739 // Interleave memory: for each Interleave Group we marked earlier as relevant
3740 // for this VPlan, replace the Recipes widening its memory instructions with a
3741 // single VPInterleaveRecipe at its insertion point.
3742 VPDominatorTree VPDT(Plan);
3743 for (const auto *IG : InterleaveGroups) {
// Member 0's recipe provides the group's base address and seed metadata.
3744 auto *Start =
3745 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3746 VPIRMetadata InterleaveMD(*Start);
3747 SmallVector<VPValue *, 4> StoredValues;
3748 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3749 StoredValues.push_back(StoreR->getStoredValue());
// Collect remaining members' stored values and intersect metadata so the
// combined recipe only keeps metadata valid for every member.
3750 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3751 Instruction *MemberI = IG->getMember(I);
3752 if (!MemberI)
3753 continue;
3754 VPWidenMemoryRecipe *MemoryR =
3755 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3756 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3757 StoredValues.push_back(StoreR->getStoredValue());
3758 InterleaveMD.intersect(*MemoryR);
3759 }
3760
// Gaps need a mask when a required scalar epilogue is unavailable, or when
// storing with an incomplete group (missing members must not be written).
3761 bool NeedsMaskForGaps =
3762 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3763 (!StoredValues.empty() && !IG->isFull());
3764
3765 Instruction *IRInsertPos = IG->getInsertPos();
3766 auto *InsertPos =
3767 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3768
// NOTE(review): line 3769 (the declaration of NW, presumably
// GEPNoWrapFlags) is missing from this listing.
3770 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3771 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3772 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3773
3774 // Get or create the start address for the interleave group.
3775 VPValue *Addr = Start->getAddr();
3776 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3777 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3778 // We cannot re-use the address of member zero because it does not
3779 // dominate the insert position. Instead, use the address of the insert
3780 // position and create a PtrAdd adjusting it to the address of member
3781 // zero.
3782 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3783 // InsertPos or sink loads above zero members to join it.
3784 assert(IG->getIndex(IRInsertPos) != 0 &&
3785 "index of insert position shouldn't be zero");
3786 auto &DL = IRInsertPos->getDataLayout();
// Byte offset from member zero to the insert position; negated below to
// step from the insert position back to member zero.
3787 APInt Offset(32,
3788 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3789 IG->getIndex(IRInsertPos),
3790 /*IsSigned=*/true);
3791 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3792 VPBuilder B(InsertPos);
3793 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3794 }
3795 // If the group is reverse, adjust the index to refer to the last vector
3796 // lane instead of the first. We adjust the index from the first vector
3797 // lane, rather than directly getting the pointer for lane VF - 1, because
3798 // the pointer operand of the interleaved access is supposed to be uniform.
3799 if (IG->isReverse()) {
3800 auto *ReversePtr = new VPVectorEndPointerRecipe(
3801 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3802 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3803 ReversePtr->insertBefore(InsertPos);
3804 Addr = ReversePtr;
3805 }
3806 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3807 InsertPos->getMask(), NeedsMaskForGaps,
3808 InterleaveMD, InsertPos->getDebugLoc());
3809 VPIG->insertBefore(InsertPos);
3810
// Rewire loads' users to the interleave recipe's results (one result per
// non-void member, in member order), then erase all member recipes.
3811 unsigned J = 0;
3812 for (unsigned i = 0; i < IG->getFactor(); ++i)
3813 if (Instruction *Member = IG->getMember(i)) {
3814 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3815 if (!Member->getType()->isVoidTy()) {
3816 VPValue *OriginalV = MemberR->getVPSingleValue();
3817 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3818 J++;
3819 }
3820 MemberR->eraseFromParent();
3821 }
3822 }
3823}
3824
3825/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3826/// value, phi and backedge value. In the following example:
3827///
3828/// vector.ph:
3829/// Successor(s): vector loop
3830///
3831/// <x1> vector loop: {
3832/// vector.body:
3833/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3834/// ...
3835/// EMIT branch-on-count ...
3836/// No successors
3837/// }
3838///
3839/// WIDEN-INDUCTION will get expanded to:
3840///
3841/// vector.ph:
3842/// ...
3843/// vp<%induction.start> = ...
3844/// vp<%induction.increment> = ...
3845///
3846/// Successor(s): vector loop
3847///
3848/// <x1> vector loop: {
3849/// vector.body:
3850/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3851/// ...
3852/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3853/// EMIT branch-on-count ...
3854/// No successors
3855/// }
// Expands a VPWidenIntOrFpInductionRecipe into explicit start-value, widened
// phi and backedge-increment recipes (see the doc comment above for the
// before/after shape).
3856 static void
// NOTE(review): line 3857 (this function's name and first parameter,
// presumably expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *,)
// is missing from this listing.
3858 VPTypeAnalysis &TypeInfo) {
3859 VPlan *Plan = WidenIVR->getParent()->getPlan();
3860 VPValue *Start = WidenIVR->getStartValue();
3861 VPValue *Step = WidenIVR->getStepValue();
3862 VPValue *VF = WidenIVR->getVFValue();
3863 DebugLoc DL = WidenIVR->getDebugLoc();
3864
3865 // The value from the original loop to which we are mapping the new induction
3866 // variable.
3867 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3868
3869 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// NOTE(review): lines 3870-3871 (presumably the declarations of AddOp and
// MulOp) are missing from this listing.
3872 VPIRFlags Flags = *WidenIVR;
// Integer inductions use add/mul; FP inductions use the descriptor's
// induction opcode (fadd/fsub) and fmul.
3873 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3874 AddOp = Instruction::Add;
3875 MulOp = Instruction::Mul;
3876 } else {
3877 AddOp = ID.getInductionOpcode();
3878 MulOp = Instruction::FMul;
3879 }
3880
3881 // If the phi is truncated, truncate the start and step values.
3882 VPBuilder Builder(Plan->getVectorPreheader());
3883 Type *StepTy = TypeInfo.inferScalarType(Step);
3884 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3885 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3886 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3887 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3888 StepTy = Ty;
3889 }
3890
3891 // Construct the initial value of the vector IV in the vector loop preheader.
3892 Type *IVIntTy =
// NOTE(review): line 3893 (the initializer of IVIntTy) is missing from this
// listing.
3894 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
// For FP inductions the integer step vector is converted to FP first.
3895 if (StepTy->isFloatingPointTy())
3896 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3897
3898 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3899 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3900
// Initial vector IV: start + <0, 1, ..., VF-1> * step.
3901 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3902 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3903 DebugLoc::getUnknown(), "induction");
3904
3905 // Create the widened phi of the vector IV.
3906 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3907 Init, WidenIVR->getDebugLoc(), "vec.ind");
3908
3909 // Create the backedge value for the vector IV.
3910 VPValue *Inc;
3911 VPValue *Prev;
3912 // If unrolled, use the increment and prev value from the operands.
3913 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3914 Inc = SplatVF;
3915 Prev = WidenIVR->getLastUnrolledPartOperand();
3916 } else {
// Insert right after VF's defining recipe so the increment computation
// sees the VF value.
3917 if (VPRecipeBase *R = VF->getDefiningRecipe())
3918 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3919 // Multiply the vectorization factor by the step using integer or
3920 // floating-point arithmetic as appropriate.
3921 if (StepTy->isFloatingPointTy())
3922 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3923 DL);
3924 else
3925 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3926 TypeInfo.inferScalarType(VF), DL);
3927
3928 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3929 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3930 Prev = WidePHI;
3931 }
3932
// NOTE(review): line 3933 (presumably obtaining ExitingBB from the loop
// region) is missing from this listing.
3934 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3935 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3936 WidenIVR->getDebugLoc(), "vec.ind.next");
3937
// Complete the phi with its backedge value, then retire the old recipe.
3938 WidePHI->addOperand(Next);
3939
3940 WidenIVR->replaceAllUsesWith(WidePHI);
3941}
3942
3943/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3944/// initial value, phi and backedge value. In the following example:
3945///
3946/// <x1> vector loop: {
3947/// vector.body:
3948/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3949/// ...
3950/// EMIT branch-on-count ...
3951/// }
3952///
3953/// WIDEN-POINTER-INDUCTION will get expanded to:
3954///
3955/// <x1> vector loop: {
3956/// vector.body:
3957/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3958/// EMIT %mul = mul %stepvector, %step
3959/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3960/// ...
3961/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3962/// EMIT branch-on-count ...
3963/// }
// NOTE(review): line 3964 (this function's name and first parameter,
// presumably expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *,)
// is missing from this listing; see the doc comment above for its contract.
3965 VPTypeAnalysis &TypeInfo) {
3966 VPlan *Plan = R->getParent()->getPlan();
3967 VPValue *Start = R->getStartValue();
3968 VPValue *Step = R->getStepValue();
3969 VPValue *VF = R->getVFValue();
3970
3971 assert(R->getInductionDescriptor().getKind() ==
// NOTE(review): line 3972 (the IK_PtrInduction enumerator operand of this
// assert) is missing from this listing.
3973 "Not a pointer induction according to InductionDescriptor!");
3974 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3975 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3976 "Recipe should have been replaced");
3977
3978 VPBuilder Builder(R);
3979 DebugLoc DL = R->getDebugLoc();
3980
3981 // Build a scalar pointer phi.
3982 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3983
3984 // Create actual address geps that use the pointer phi as base and a
3985 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
// Placed after all phis so the wide GEP sits in the header's body.
3986 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3987 Type *StepTy = TypeInfo.inferScalarType(Step);
3988 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3989 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3990 VPValue *PtrAdd =
3991 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3992 R->replaceAllUsesWith(PtrAdd);
3993
3994 // Create the backedge value for the scalar pointer phi.
// NOTE(review): line 3995 (presumably obtaining ExitingBB from the loop
// region) is missing from this listing.
3996 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// Advance the scalar phi by VF * step each iteration.
3997 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3998 DL);
3999 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
4000
4001 VPValue *InductionGEP =
4002 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
4003 ScalarPtrPhi->addOperand(InductionGEP);
4004}
4005
// NOTE(review): the opening line with this function's name is not visible
// here (numbering jumps 4005->4007); from the body it appears to be
// VPlanTransforms::dissolveLoopRegions(VPlan &) — confirm against the header.
4007 // Replace loop regions with explicit CFG.
4008 SmallVector<VPRegionBlock *> LoopRegions;
// Collect loop (non-replicator) regions first; dissolving is done in a
// second pass, presumably to avoid mutating the traversal while iterating.
// NOTE(review): line 4009 (the for-loop header over regions) is missing from
// this listing.
4010 vp_depth_first_deep(Plan.getEntry()))) {
4011 if (!R->isReplicator())
4012 LoopRegions.push_back(R);
4013 }
4014 for (VPRegionBlock *R : LoopRegions)
4015 R->dissolveToCFGLoop();
4016 }
4017
// NOTE(review): the opening line(s) with this function's name are not visible
// here (numbering jumps 4017->4020); from the body it appears to be the
// VPlanTransforms pass that expands BranchOnTwoConds terminators into two
// single-condition branches — confirm the name against the header.
4020 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4021 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// NOTE(review): lines 4022-4023 (the WorkList declaration and the loop header
// over blocks) are missing from this listing.
4024 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4025 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4026 }
4027
4028 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4029 // single-condition branches:
4030 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4031 // the first condition is true, and otherwise jumps to a new interim block.
4032 // 2. A branch that ends the interim block, jumps to the second successor if
4033 // the second condition is true, and otherwise jumps to the third
4034 // successor.
4035 for (VPInstruction *Br : WorkList) {
4036 assert(Br->getNumOperands() == 2 &&
4037 "BranchOnTwoConds must have exactly 2 conditions");
4038 DebugLoc DL = Br->getDebugLoc();
4039 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Snapshot successors before disconnecting; the CFG is rebuilt below.
4040 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4041 assert(Successors.size() == 3 &&
4042 "BranchOnTwoConds must have exactly 3 successors");
4043
4044 for (VPBlockBase *Succ : Successors)
4045 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4046
4047 VPValue *Cond0 = Br->getOperand(0);
4048 VPValue *Cond1 = Br->getOperand(1);
4049 VPBlockBase *Succ0 = Successors[0];
4050 VPBlockBase *Succ1 = Successors[1];
4051 VPBlockBase *Succ2 = Successors[2];
4052 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4053 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4054
4055 VPBasicBlock *InterimBB =
4056 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4057
// First branch: Cond0 ? Succ0 : InterimBB.
// NOTE(review): line 4059 (the createNaryOp call creating this branch) is
// missing from this listing.
4058 VPBuilder(BrOnTwoCondsBB)
4060 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4061 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4062
// Second branch (in InterimBB): Cond1 ? Succ1 : Succ2.
// NOTE(review): line 4063 (the builder call creating this branch) is missing
// from this listing.
4064 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4065 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4066 Br->eraseFromParent();
4067 }
4068}
4069
// NOTE(review): the opening line with this function's name is not visible
// here (numbering jumps 4069->4071); from the body it appears to be
// VPlanTransforms::convertToConcreteRecipes(VPlan &) — confirm against the
// header. Lowers abstract recipes (widened inductions, blends, expressions,
// LastActiveLane, MaskedCond, CanonicalIVIncrementForPart, BranchOnCount,
// WideIVStep) into executable concrete recipes.
4071 VPTypeAnalysis TypeInfo(Plan);
// NOTE(review): lines 4072-4073 (presumably the ToRemove vector declaration
// and the loop header over blocks) are partly missing from this listing.
4074 vp_depth_first_deep(Plan.getEntry()))) {
4075 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4076 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4077 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4078 ToRemove.push_back(WidenIVR);
4079 continue;
4080 }
4081
4082 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4083 // If the recipe only generates scalars, scalarize it instead of
4084 // expanding it.
4085 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4086 VPBuilder Builder(WidenIVR);
4087 VPValue *PtrAdd =
4088 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4089 WidenIVR->replaceAllUsesWith(PtrAdd);
4090 ToRemove.push_back(WidenIVR);
4091 continue;
4092 }
4093 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4094 ToRemove.push_back(WidenIVR);
4095 continue;
4096 }
4097
4098 // Expand VPBlendRecipe into VPInstruction::Select.
// Fold incoming values left-to-right into a chain of selects; the first
// incoming value is the fallback when no later mask is set.
4099 VPBuilder Builder(&R);
4100 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4101 VPValue *Select = Blend->getIncomingValue(0);
4102 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4103 Select = Builder.createSelect(Blend->getMask(I),
4104 Blend->getIncomingValue(I), Select,
4105 R.getDebugLoc(), "predphi", *Blend);
4106 Blend->replaceAllUsesWith(Select);
4107 ToRemove.push_back(Blend);
4108 }
4109
4110 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4111 if (!VEPR->getOffset()) {
4112 assert(Plan.getConcreteUF() == 1 &&
4113 "Expected unroller to have materialized offset for UF != 1");
4114 VEPR->materializeOffset();
4115 }
4116 }
4117
// VPExpressionRecipes are bundles; decompose back into constituent
// recipes before execution.
4118 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4119 Expr->decompose();
4120 ToRemove.push_back(Expr);
4121 }
4122
4123 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4124 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4125 if (LastActiveL &&
4126 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4127 // Create Not(Mask) for all operands.
// NOTE(review): line 4128 (the NotMasks vector declaration) is missing
// from this listing.
4129 for (VPValue *Op : LastActiveL->operands()) {
4130 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4131 NotMasks.push_back(NotMask);
4132 }
4133
4134 // Create FirstActiveLane on the inverted masks.
4135 VPValue *FirstInactiveLane = Builder.createNaryOp(
// NOTE(review): line 4136 (the FirstActiveLane opcode and NotMasks
// operand) is missing from this listing.
4137 LastActiveL->getDebugLoc(), "first.inactive.lane");
4138
4139 // Subtract 1 to get the last active lane.
4140 VPValue *One =
4141 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4142 VPValue *LastLane =
4143 Builder.createSub(FirstInactiveLane, One,
4144 LastActiveL->getDebugLoc(), "last.active.lane");
4145
4146 LastActiveL->replaceAllUsesWith(LastLane);
4147 ToRemove.push_back(LastActiveL);
4148 continue;
4149 }
4150
4151 // Lower MaskedCond with block mask to LogicalAnd.
// NOTE(review): line 4152 (the match() guard for MaskedCond) is missing
// from this listing.
4153 auto *VPI = cast<VPInstruction>(&R);
4154 assert(VPI->isMasked() &&
4155 "Unmasked MaskedCond should be simplified earlier");
4156 VPI->replaceAllUsesWith(Builder.createNaryOp(
4157 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4158 ToRemove.push_back(VPI);
4159 continue;
4160 }
4161
4162 // Lower CanonicalIVIncrementForPart to plain Add.
4163 if (match(
4164 &R,
// NOTE(review): line 4165 (the m_VPInstruction matcher for
// CanonicalIVIncrementForPart) is missing from this listing.
4166 auto *VPI = cast<VPInstruction>(&R);
4167 VPValue *Add = Builder.createOverflowingOp(
4168 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4169 VPI->getDebugLoc());
4170 VPI->replaceAllUsesWith(Add);
4171 ToRemove.push_back(VPI);
4172 continue;
4173 }
4174
4175 // Lower BranchOnCount to ICmp + BranchOnCond.
4176 VPValue *IV, *TC;
4177 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4178 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4179 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4180 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4181 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4182 ToRemove.push_back(BranchOnCountInst);
4183 continue;
4184 }
4185
4186 VPValue *VectorStep;
4187 VPValue *ScalarStep;
// NOTE(review): line 4188 (the negated match() guard for WideIVStep) is
// missing from this listing.
4189 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4190 continue;
4191
4192 // Expand WideIVStep.
4193 auto *VPI = cast<VPInstruction>(&R);
4194 Type *IVTy = TypeInfo.inferScalarType(VPI);
// Convert the vector step to the IV's type: UIToFP for FP IVs, Trunc
// otherwise (see the condition elided at line 4196).
4195 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
// NOTE(review): line 4196 (the CastOp selection condition) is missing
// from this listing.
4197 ? Instruction::UIToFP
4198 : Instruction::Trunc;
4199 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4200 }
4201
4202 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4203 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4204 ScalarStep =
4205 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4206 }
4207
// FP IVs carry the instruction's fast-math flags; integer IVs use the
// default flags for mul.
4208 VPIRFlags Flags;
4209 unsigned MulOpc;
4210 if (IVTy->isFloatingPointTy()) {
4211 MulOpc = Instruction::FMul;
4212 Flags = VPI->getFastMathFlags();
4213 } else {
4214 MulOpc = Instruction::Mul;
4215 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4216 }
4217
4218 VPInstruction *Mul = Builder.createNaryOp(
4219 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4220 VectorStep = Mul;
4221 VPI->replaceAllUsesWith(VectorStep);
4222 ToRemove.push_back(VPI);
4223 }
4224 }
4225
// Erase retired recipes only after the traversal to keep iteration valid.
4226 for (VPRecipeBase *R : ToRemove)
4227 R->eraseFromParent();
4228}
4229
// NOTE(review): the line carrying this function's name is not visible here
// (numbering jumps 4229->4231); from the body it appears to be the
// VPlanTransforms pass handling uncountable early exits — confirm the exact
// name against the header. It collects all early-exit conditions, ORs them
// into a single latch-level AnyOf check, and builds a dispatch chain that
// routes to the correct per-exit block using the first active lane.
4231 VPBasicBlock *HeaderVPBB,
4232 VPBasicBlock *LatchVPBB,
4233 VPBasicBlock *MiddleVPBB,
4234 UncountableExitStyle Style) {
// Per-exit bookkeeping: the exiting block, its IR exit block, and the
// (lane-wise) condition under which this exit is taken.
4235 struct EarlyExitInfo {
4236 VPBasicBlock *EarlyExitingVPBB;
4237 VPIRBasicBlock *EarlyExitVPBB;
4238 VPValue *CondToExit;
4239 };
4240
4241 VPDominatorTree VPDT(Plan);
4242 VPBuilder Builder(LatchVPBB->getTerminator());
// NOTE(review): line 4243 (presumably the SmallVector<EarlyExitInfo> Exits
// declaration) is missing from this listing.
4244 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4245 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
// The middle block is the countable (latch) exit path, not an early exit.
4246 if (Pred == MiddleVPBB)
4247 continue;
4248 // Collect condition for this early exit.
4249 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4250 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4251 VPValue *CondOfEarlyExitingVPBB;
4252 [[maybe_unused]] bool Matched =
4253 match(EarlyExitingVPBB->getTerminator(),
4254 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4255 assert(Matched && "Terminator must be BranchOnCond");
4256
4257 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4258 // the correct block mask.
4259 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4260 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
// NOTE(review): line 4261 (the MaskedCond opcode operand) is missing
// from this listing.
// Normalize the condition so "true" always means "exit taken": negate
// when the exit block is the false successor.
4262 TrueSucc == ExitBlock
4263 ? CondOfEarlyExitingVPBB
4264 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4265 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4266 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4267 VPDT.properlyDominates(
4268 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4269 LatchVPBB)) &&
4270 "exit condition must dominate the latch");
4271 Exits.push_back({
4272 EarlyExitingVPBB,
4273 ExitBlock,
4274 CondToEarlyExit,
4275 });
4276 }
4277 }
4278
4279 assert(!Exits.empty() && "must have at least one early exit");
4280 // Sort exits by RPO order to get correct program order. RPO gives a
4281 // topological ordering of the CFG, ensuring upstream exits are checked
4282 // before downstream exits in the dispatch chain.
// NOTE(review): line 4283 (the RPOT declaration) is missing from this
// listing.
4284 HeaderVPBB);
// NOTE(review): line 4285 (the RPOIdx map declaration) is missing from this
// listing.
4286 for (const auto &[Num, VPB] : enumerate(RPOT))
4287 RPOIdx[VPB] = Num;
4288 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4289 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4290 });
4291#ifndef NDEBUG
4292 // After RPO sorting, verify that for any pair where one exit dominates
4293 // another, the dominating exit comes first. This is guaranteed by RPO
4294 // (topological order) and is required for the dispatch chain correctness.
4295 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4296 for (unsigned J = I + 1; J < Exits.size(); ++J)
4297 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4298 Exits[I].EarlyExitingVPBB) &&
4299 "RPO sort must place dominating exits before dominated ones");
4300#endif
4301
4302 // Build the AnyOf condition for the latch terminator using logical OR
4303 // to avoid poison propagation from later exit conditions when an earlier
4304 // exit is taken.
4305 VPValue *Combined = Exits[0].CondToExit;
4306 for (const EarlyExitInfo &Info : drop_begin(Exits))
4307 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4308
4309 VPValue *IsAnyExitTaken =
4310 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4311
// NOTE(review): line 4312 (the assert's condition, presumably on Style) is
// missing from this listing.
4313 "Early exit store masking not implemented");
4314
4315 // Create the vector.early.exit blocks.
4316 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4317 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
// A single exit keeps the plain name; multiple exits get ".0", ".1", ...
4318 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4319 VPBasicBlock *VectorEarlyExitVPBB =
4320 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4321 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4322 }
4323
4324 // Create the dispatch block (or reuse the single exit block if only one
4325 // exit). The dispatch block computes the first active lane of the combined
4326 // condition and, for multiple exits, chains through conditions to determine
4327 // which exit to take.
4328 VPBasicBlock *DispatchVPBB =
4329 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4330 : Plan.createVPBasicBlock("vector.early.exit.check");
4331 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4332 VPValue *FirstActiveLane =
4333 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4334 DebugLoc::getUnknown(), "first.active.lane");
4335
4336 // For each early exit, disconnect the original exiting block
4337 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4338 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4339 // values at the first active lane:
4340 //
4341 // Input:
4342 // early.exiting.I:
4343 // ...
4344 // EMIT branch-on-cond vp<%cond.I>
4345 // Successor(s): in.loop.succ, ir-bb<exit.I>
4346 //
4347 // ir-bb<exit.I>:
4348 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4349 //
4350 // Output:
4351 // early.exiting.I:
4352 // ...
4353 // Successor(s): in.loop.succ
4354 //
4355 // vector.early.exit.I:
4356 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4357 // Successor(s): ir-bb<exit.I>
4358 //
4359 // ir-bb<exit.I>:
4360 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4361 // vector.early.exit.I)
4362 //
4363 for (auto [Exit, VectorEarlyExitVPBB] :
4364 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4365 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4366 // Adjust the phi nodes in EarlyExitVPBB.
4367 // 1. remove incoming values from EarlyExitingVPBB,
4368 // 2. extract the incoming value at FirstActiveLane
4369 // 3. add back the extracts as last operands for the phis
4370 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4371 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4372 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4373 // values from VectorEarlyExitVPBB.
4374 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4375 auto *ExitIRI = cast<VPIRPhi>(&R);
4376 VPValue *IncomingVal =
4377 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4378 VPValue *NewIncoming = IncomingVal;
// Live-in IR values are uniform; only computed values need a lane
// extract.
4379 if (!isa<VPIRValue>(IncomingVal)) {
4380 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4381 NewIncoming = EarlyExitBuilder.createNaryOp(
4382 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4383 DebugLoc::getUnknown(), "early.exit.value");
4384 }
4385 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4386 ExitIRI->addOperand(NewIncoming);
4387 }
4388
4389 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4390 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4391 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4392 }
4393
4394 // Chain through exits: for each exit, check if its condition is true at
4395 // the first active lane. If so, take that exit; otherwise, try the next.
4396 // The last exit needs no check since it must be taken if all others fail.
4397 //
4398 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4399 //
4400 // latch:
4401 // ...
4402 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4403 // ...
4404 //
4405 // vector.early.exit.check:
4406 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4407 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4408 // EMIT branch-on-cond vp<%at.cond.0>
4409 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4410 //
4411 // vector.early.exit.check.0:
4412 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4413 // EMIT branch-on-cond vp<%at.cond.1>
4414 // Successor(s): vector.early.exit.1, vector.early.exit.2
4415 VPBasicBlock *CurrentBB = DispatchVPBB;
4416 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4417 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4418 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4419 DebugLoc::getUnknown(), "exit.cond.at.lane");
4420
4421 // For the last dispatch, branch directly to the last exit on false;
4422 // otherwise, create a new check block.
4423 bool IsLastDispatch = (I + 2 == Exits.size());
4424 VPBasicBlock *FalseBB =
4425 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4426 : Plan.createVPBasicBlock(
4427 Twine("vector.early.exit.check.") + Twine(I));
4428
4429 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4430 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4431 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4432 FalseBB->setPredecessors({CurrentBB});
4433
4434 CurrentBB = FalseBB;
4435 DispatchBuilder.setInsertPoint(CurrentBB);
4436 }
4437
4438 // Replace the latch terminator with the new branching logic.
4439 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4440 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4441 "Unexpected terminator");
// Lower the BranchOnCount into an explicit IV == trip-count compare so it
// can be combined with the early-exit condition below.
4442 auto *IsLatchExitTaken =
4443 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4444 LatchExitingBranch->getOperand(1));
4445
4446 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4447 LatchExitingBranch->eraseFromParent();
4448 Builder.setInsertPoint(LatchVPBB);
// Successor order matches BranchOnTwoConds semantics: any-exit-taken ->
// dispatch, latch-exit-taken -> middle block, else loop back to header.
4449 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4450 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4451 LatchVPBB->clearSuccessors();
4452 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4453 DispatchVPBB->setPredecessors({LatchVPBB});
4454}
4455
4456/// This function tries to convert extended in-loop reductions to
4457/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4458/// valid. The created recipe must be decomposed to its constituent
4459/// recipes before execution. Returns the new recipe on success, nullptr
/// otherwise.
4460static VPExpressionRecipe *
4462                                    VFRange &Range) {
4463  Type *RedTy = Ctx.Types.inferScalarType(Red);
4464  VPValue *VecOp = Red->getVecOp();
4465
4466  assert(!Red->isPartialReduction() &&
4467         "This path does not support partial reductions");
4468
4469  // Clamp the range if using extended-reduction is profitable.
  // Profitability check per VF: the target's combined extended-reduction cost
  // must be valid and strictly cheaper than the separate ext + reduction costs.
4470  auto IsExtendedRedValidAndClampRange =
4471      [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4473        [&](ElementCount VF) {
4474          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4476
4478          InstructionCost ExtCost =
4479              cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4480          InstructionCost RedCost = Red->computeCost(VF, Ctx);
4481
4482          assert(!RedTy->isFloatingPointTy() &&
4483                 "getExtendedReductionCost only supports integer types");
4484          ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4485              Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4486              Red->getFastMathFlags(), CostKind);
4487          return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4488        },
4489        Range);
4490  };
4491
4492  VPValue *A;
4493  // Match reduce(ext(A)).
4495      IsExtendedRedValidAndClampRange(
4496          RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4497          cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4498          Ctx.Types.inferScalarType(A)))
4499    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4500
4501  return nullptr;
4502}
4503
4504/// This function tries to convert extended in-loop reductions to
4505/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4506/// and valid. The created VPExpressionRecipe must be decomposed to its
4507/// constituent recipes before execution. Patterns of the
4508/// VPExpressionRecipe:
4509///   reduce.add(mul(...)),
4510///   reduce.add(mul(ext(A), ext(B))),
4511///   reduce.add(ext(mul(ext(A), ext(B)))).
4512///   reduce.fadd(fmul(ext(A), ext(B)))
4513static VPExpressionRecipe *
4515                                          VPCostContext &Ctx, VFRange &Range) {
4516  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4517  if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4518      Opcode != Instruction::FAdd)
4519    return nullptr;
4520
4521  assert(!Red->isPartialReduction() &&
4522         "This path does not support partial reductions");
4523  Type *RedTy = Ctx.Types.inferScalarType(Red);
4524
4525  // Clamp the range if using multiply-accumulate-reduction is profitable.
  // Per-VF profitability: the target's mul-acc reduction cost must beat the
  // summed costs of the extends (inner and outer), the multiply and the
  // reduction it would replace.
4526  auto IsMulAccValidAndClampRange =
4528                          VPWidenCastRecipe *OuterExt) -> bool {
4530        [&](ElementCount VF) {
4532          Type *SrcTy =
4533              Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4534          InstructionCost MulAccCost;
4535
4536          // getMulAccReductionCost for in-loop reductions does not support
4537          // mixed or floating-point extends.
4538          if (Ext0 && Ext1 &&
4539              (Ext0->getOpcode() != Ext1->getOpcode() ||
4540               Ext0->getOpcode() == Instruction::CastOps::FPExt))
4541            return false;
4542
4543          bool IsZExt =
4544              !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4545          auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4546          MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4547                                                      SrcVecTy, CostKind);
4548
4549          InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4550          InstructionCost RedCost = Red->computeCost(VF, Ctx);
4551          InstructionCost ExtCost = 0;
4552          if (Ext0)
4553            ExtCost += Ext0->computeCost(VF, Ctx);
4554          if (Ext1)
4555            ExtCost += Ext1->computeCost(VF, Ctx);
4556          if (OuterExt)
4557            ExtCost += OuterExt->computeCost(VF, Ctx);
4558
4559          return MulAccCost.isValid() &&
4560                 MulAccCost < ExtCost + MulCost + RedCost;
4561        },
4562        Range);
4563  };
4564
4565  VPValue *VecOp = Red->getVecOp();
4566  VPRecipeBase *Sub = nullptr;
4567  VPValue *A, *B;
4568  VPValue *Tmp = nullptr;
4569
  // NOTE(review): the header comment lists reduce.fadd(fmul(ext(A), ext(B))),
  // but FP reductions bail out here — confirm the fadd pattern is matched
  // elsewhere or that the header is stale.
4570  if (RedTy->isFloatingPointTy())
4571    return nullptr;
4572
4573  // Sub reductions could have a sub between the add reduction and vec op.
4574  if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4575    Sub = VecOp->getDefiningRecipe();
4576    VecOp = Tmp;
4577  }
4578
4579  // If ValB is a constant and can be safely extended, truncate it to the same
4580  // type as ExtA's operand, then extend it to the same type as ExtA. This
4581  // creates two uniform extends that can more easily be matched by the rest of
4582  // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4583  // replaced with the new extend of the constant.
4584  auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4585                                           VPWidenCastRecipe *&ExtB,
4586                                           VPValue *&ValB, VPWidenRecipe *Mul) {
4587    if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4588      return;
4589    Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4590    Instruction::CastOps ExtOpc = ExtA->getOpcode();
4591    const APInt *Const;
4592    if (!match(ValB, m_APInt(Const)) ||
4594            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4595      return;
4596    // The truncate ensures that the type of each extended operand is the
4597    // same, and it's been proven that the constant can be extended from
4598    // NarrowTy safely. Necessary since ExtA's extended operand would be
4599    // e.g. an i8, while the const will likely be an i32. This will be
4600    // elided by later optimisations.
4601    VPBuilder Builder(Mul);
4602    auto *Trunc =
4603        Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4604    Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4605    ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4606    Mul->setOperand(1, ExtB);
4607  };
4608
4609  // Try to match reduce.add(mul(...)).
4610  if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4611    auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4612    auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4613    auto *Mul = cast<VPWidenRecipe>(VecOp);
4614
4615    // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4616    ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4617
4618    // Match reduce.add/sub(mul(ext, ext)).
4619    if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4620        match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4621        IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4622      if (Sub)
4623        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4624                                      cast<VPWidenRecipe>(Sub), Red);
4625      return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4626    }
4627    // TODO: Add an expression type for this variant with a negated mul
4628    if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4629      return new VPExpressionRecipe(Mul, Red);
4630  }
4631  // TODO: Add an expression type for negated versions of other expression
4632  // variants.
4633  if (Sub)
4634    return nullptr;
4635
4636  // Match reduce.add(ext(mul(A, B))).
4637  if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4638    auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4639    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4640    auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4641    auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4642
4643    // reduce.add(ext(mul(ext, const)))
4644    // -> reduce.add(ext(mul(ext, ext(const))))
4645    ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4646
4647    // reduce.add(ext(mul(ext(A), ext(B))))
4648    // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4649    // The inner extends must either have the same opcode as the outer extend or
4650    // be the same, in which case the multiply can never result in a negative
4651    // value and the outer extend can be folded away by doing wider
4652    // extends for the operands of the mul.
4653    if (Ext0 && Ext1 &&
4654        (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4655        Ext0->getOpcode() == Ext1->getOpcode() &&
4656        IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
      // Widen the inner extends directly to the outer extend's result type so
      // the outer extend becomes redundant; the old narrow extends are left in
      // place and the mul is re-pointed at the new, wider ones.
4657      auto *NewExt0 = new VPWidenCastRecipe(
4658          Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4659          *Ext0, *Ext0, Ext0->getDebugLoc());
4660      NewExt0->insertBefore(Ext0);
4661
4662      VPWidenCastRecipe *NewExt1 = NewExt0;
4663      if (Ext0 != Ext1) {
4664        NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4665                                        Ext->getResultType(), nullptr, *Ext1,
4666                                        *Ext1, Ext1->getDebugLoc());
4667        NewExt1->insertBefore(Ext1);
4668      }
4669      Mul->setOperand(0, NewExt0);
4670      Mul->setOperand(1, NewExt1);
4671      Red->setOperand(1, Mul);
4672      return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4673    }
4674  }
4675  return nullptr;
4676}
4677
4678/// This function tries to create abstract recipes from the reduction recipe
4679/// \p Red for following optimizations and cost estimation, inserting the
/// resulting VPExpressionRecipe right after \p Red and redirecting all of
/// \p Red's users to it.
4681                                                VPCostContext &Ctx,
4682                                                VFRange &Range) {
4683  // Creation of VPExpressions for partial reductions is entirely handled in
4684  // transformToPartialReduction.
4685  assert(!Red->isPartialReduction() &&
4686         "This path does not support partial reductions");
4687
4688  VPExpressionRecipe *AbstractR = nullptr;
  // Capture the insertion point (right after Red) and parent block before any
  // matcher runs, so the expression recipe can be placed after Red below.
4689  auto IP = std::next(Red->getIterator());
4690  auto *VPBB = Red->getParent();
  // Prefer the multiply-accumulate pattern; fall back to a plain extended
  // reduction.
4691  if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4692    AbstractR = MulAcc;
4693  else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4694    AbstractR = ExtRed;
4695  // Cannot create abstract inloop reduction recipes.
4696  if (!AbstractR)
4697    return;
4698
4699  AbstractR->insertBefore(*VPBB, IP);
4700  Red->replaceAllUsesWith(AbstractR);
4701}
4702
4713
4715  if (Plan.hasScalarVFOnly())
4716    return;
4717
  // The dominator tree is only needed for the dominance assertion below, so it
  // is built in asserts builds only.
4718#ifndef NDEBUG
4719  VPDominatorTree VPDT(Plan);
4720#endif
4721
  // Candidates: the backedge-taken count, all live-ins, and everything defined
  // in the entry block.
4722  SmallVector<VPValue *> VPValues;
4723  if (VPValue *BTC = Plan.getBackedgeTakenCount())
4724    VPValues.push_back(BTC);
4725  append_range(VPValues, Plan.getLiveIns());
4726  for (VPRecipeBase &R : *Plan.getEntry())
4727    append_range(VPValues, R.definedValues());
4728
4729  auto *VectorPreheader = Plan.getVectorPreheader();
4730  for (VPValue *VPV : VPValues) {
4732        (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4733      continue;
4734
4735    // Add explicit broadcast at the insert point that dominates all users.
4736    VPBasicBlock *HoistBlock = VectorPreheader;
4737    VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4738    for (VPUser *User : VPV->users()) {
4739      if (User->usesScalars(VPV))
4740        continue;
      // A vector user inside the preheader itself forces the broadcast to the
      // top of the preheader; everything else must be dominated by it.
4741      if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4742        HoistPoint = HoistBlock->begin();
4743      else
4744        assert(VPDT.dominates(VectorPreheader,
4745                              cast<VPRecipeBase>(User)->getParent()) &&
4746               "All users must be in the vector preheader or dominated by it");
4747    }
4748
4749    VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4750    auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
    // Redirect only vector users (and not the broadcast itself); scalar users
    // keep using the original value.
4751    VPV->replaceUsesWithIf(Broadcast,
4752                           [VPV, Broadcast](VPUser &U, unsigned Idx) {
4753                             return Broadcast != &U && !U.usesScalars(VPV);
4754                           });
4755  }
4756}
4757
4759  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4760
4761  // Collect candidate loads with invariant addresses and noalias scopes
4762  // metadata and memory-writing recipes with noalias metadata.
4766           vp_depth_first_shallow(LoopRegion->getEntry()))) {
4767    for (VPRecipeBase &R : *VPBB) {
4768      // Only handle single-scalar replicated loads with invariant addresses.
4769      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4770        if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4771            RepR->getOpcode() != Instruction::Load)
4772          continue;
4773
4774        VPValue *Addr = RepR->getOperand(0);
4775        if (Addr->isDefinedOutsideLoopRegions()) {
          // Only loads carrying noalias-scope metadata can be disambiguated
          // against the collected stores below.
4777          if (!Loc.AATags.Scope)
4778            continue;
4779          CandidateLoads.push_back({RepR, Loc});
4780        }
4781      }
      // A write without both scope and noalias metadata could alias any
      // candidate load; give up on the whole transform in that case.
4782      if (R.mayWriteToMemory()) {
4784        if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4785          return;
4786        Stores.push_back(*Loc);
4787      }
4788    }
4789  }
4790
4791  VPBasicBlock *Preheader = Plan.getVectorPreheader();
4792  for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4793    // Hoist the load to the preheader if it doesn't alias with any stores
4794    // according to the noalias metadata. Other loads should have been hoisted
4795    // by other passes.
4796    const AAMDNodes &LoadAA = LoadLoc.AATags;
4797    if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4799                 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4800               })) {
4801      LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4802    }
4803  }
4804}
4805
4806// Collect common metadata from a group of replicate recipes by intersecting
4807// metadata from all recipes in the group. \p Recipes must be non-empty, as
// the first member is dereferenced unconditionally.
4809  VPIRMetadata CommonMetadata = *Recipes.front();
4810  for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4811    CommonMetadata.intersect(*Recipe);
4812  return CommonMetadata;
4813}
4814
// Collect groups of predicated replicated loads/stores (selected by \p Opcode)
// to the same address and of the same value type, keeping only groups where at
// least one pair of masks is complementary (M1 == NOT(M2)).
4815template <unsigned Opcode>
4819                                   const Loop *L) {
4820  static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4821                "Only Load and Store opcodes supported");
4822  constexpr bool IsLoad = (Opcode == Instruction::Load);
4823  VPTypeAnalysis TypeInfo(Plan);
4824
4825  // For each address, collect operations with the same or complementary masks.
  // For loads the accessed type is the result; for stores it is operand 0 (the
  // stored value).
4827  auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4828    return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4829  };
4831      Plan, PSE, L,
4832      [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
  // Note: Recipes is deliberately iterated as a by-value copy; consumed
  // entries are null-ed out in the copy so they are not re-grouped.
4833  for (auto Recipes : Groups) {
4834    if (Recipes.size() < 2)
4835      continue;
4836
4837    // Collect groups with the same or complementary masks.
4838    for (VPReplicateRecipe *&RecipeI : Recipes) {
4839      if (!RecipeI)
4840        continue;
4841
4842      VPValue *MaskI = RecipeI->getMask();
4843      Type *TypeI = GetLoadStoreValueType(RecipeI);
4845      Group.push_back(RecipeI);
4846      RecipeI = nullptr;
4847
4848      // Find all operations with the same or complementary masks.
4849      bool HasComplementaryMask = false;
4850      for (VPReplicateRecipe *&RecipeJ : Recipes) {
4851        if (!RecipeJ)
4852          continue;
4853
4854        VPValue *MaskJ = RecipeJ->getMask();
4855        Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4856        if (TypeI == TypeJ) {
4857          // Check if any operation in the group has a complementary mask with
4858          // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4859          HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4860                                  match(MaskJ, m_Not(m_Specific(MaskI)));
4861          Group.push_back(RecipeJ);
4862          RecipeJ = nullptr;
4863        }
4864      }
4865
      // Groups without at least one complementary mask pair are dropped (their
      // members were already consumed above and are not reconsidered).
4866      if (HasComplementaryMask) {
4867        assert(Group.size() >= 2 && "must have at least 2 entries");
4868        AllGroups.push_back(std::move(Group));
4869      }
4870    }
4871  }
4872
4873  return AllGroups;
4874}
4875
4876// Find the recipe with minimum alignment in the group, comparing the
// alignment of each recipe's underlying InstType (LoadInst/StoreInst).
// \p Group must be non-empty: min_element's result is dereferenced.
4877template <typename InstType>
4878static VPReplicateRecipe *
4880  return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4881    return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4882           cast<InstType>(B->getUnderlyingInstr())->getAlign();
4883  });
4884}
4885
4888                                       const Loop *L) {
4889  auto Groups =
4891  if (Groups.empty())
4892    return;
4893
4894  // Process each group of loads.
4895  for (auto &Group : Groups) {
4896    // Try to use the earliest (most dominating) load to replace all others.
4897    VPReplicateRecipe *EarliestLoad = Group[0];
4898    VPBasicBlock *FirstBB = EarliestLoad->getParent();
4899    VPBasicBlock *LastBB = Group.back()->getParent();
4900
4901    // Check that the load doesn't alias with stores between first and last.
4902    auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4903    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4904      continue;
4905
4906    // Collect common metadata from all loads in the group.
4907    VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4908
4909    // Find the load with minimum alignment to use.
    // The replacement load must not claim a stronger alignment than any
    // member it replaces, so the minimum-align member's instruction is used.
4910    auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4911
4912    bool IsSingleScalar = EarliestLoad->isSingleScalar();
4913    assert(all_of(Group,
4914                  [IsSingleScalar](VPReplicateRecipe *R) {
4915                    return R->isSingleScalar() == IsSingleScalar;
4916                  }) &&
4917           "all members in group must agree on IsSingleScalar");
4918
4919    // Create an unpredicated version of the earliest load with common
4920    // metadata.
4921    auto *UnpredicatedLoad = new VPReplicateRecipe(
4922        LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4923        IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4924
4925    UnpredicatedLoad->insertBefore(EarliestLoad);
4926
4927    // Replace all loads in the group with the unpredicated load.
4928    for (VPReplicateRecipe *Load : Group) {
4929      Load->replaceAllUsesWith(UnpredicatedLoad);
4930      Load->eraseFromParent();
4931    }
4932  }
4933}
4934
/// Returns true if the group \p StoresToSink (which must be non-empty and
/// carry noalias-scope metadata) can be sunk past the memory operations
/// between the first and last member, based on noalias metadata.
4935static bool
4937                              PredicatedScalarEvolution &PSE, const Loop &L,
4938                              VPTypeAnalysis &TypeInfo) {
4939  auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4940  if (!StoreLoc || !StoreLoc->AATags.Scope)
4941    return false;
4942
4943  // When sinking a group of stores, all members of the group alias each other.
4944  // Skip them during the alias checks.
4945  SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4946                                                 StoresToSink.end());
4947
4948  VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4949  VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4950  SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4951  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4952}
4953
4956                                          const Loop *L) {
4957  auto Groups =
4959  if (Groups.empty())
4960    return;
4961
4962  VPTypeAnalysis TypeInfo(Plan);
4963
4964  for (auto &Group : Groups) {
4965    if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4966      continue;
4967
4968    // Use the last (most dominated) store's location for the unconditional
4969    // store.
4970    VPReplicateRecipe *LastStore = Group.back();
4971    VPBasicBlock *InsertBB = LastStore->getParent();
4972
4973    // Collect common alias metadata from all stores in the group.
4974    VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4975
4976    // Build select chain for stored values.
4977    VPValue *SelectedValue = Group[0]->getOperand(0);
4978    VPBuilder Builder(InsertBB, LastStore->getIterator());
4979
4980    bool IsSingleScalar = Group[0]->isSingleScalar();
    // Later group members take priority: each select overwrites the running
    // value when its mask is true, mirroring original store order.
4981    for (unsigned I = 1; I < Group.size(); ++I) {
4982      assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4983             "all members in group must agree on IsSingleScalar");
4984      VPValue *Mask = Group[I]->getMask();
4985      VPValue *Value = Group[I]->getOperand(0);
4986      SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4987                                           Group[I]->getDebugLoc());
4988    }
4989
4990    // Find the store with minimum alignment to use.
4991    auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4992
4993    // Create unconditional store with selected value and common metadata.
4994    auto *UnpredicatedStore = new VPReplicateRecipe(
4995        StoreWithMinAlign->getUnderlyingInstr(),
4996        {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4997        /*Mask=*/nullptr, *LastStore, CommonMetadata);
4998    UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4999
5000    // Remove all predicated stores from the group.
5001    for (VPReplicateRecipe *Store : Group)
5002      Store->eraseFromParent();
5003  }
5004}
5005
5007    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5009  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5010  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5011
5012  VPValue *TC = Plan.getTripCount();
5013  if (TC->getNumUsers() == 0)
5014    return;
5015
5016  // Skip cases for which the trip count may be non-trivial to materialize.
5017  // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5018  // tail is required.
5019  if (!Plan.hasScalarTail() ||
5021          Plan.getScalarPreheader() ||
5022      !isa<VPIRValue>(TC))
5023    return;
5024
5025  // Materialize vector trip counts for constants early if it can simply
5026  // be computed as (Original TC / VF * UF) * VF * UF.
5027  // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5028  // tail-folded loops.
  // Only an IR live-in trip count (checked above) can be handed to SCEV here.
5029  ScalarEvolution &SE = *PSE.getSE();
5030  auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5031  if (!isa<SCEVConstant>(TCScev))
5032    return;
5033  const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5034  auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5035  if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5036    Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5037}
5038
// Materialize the backedge-taken count as trip-count - 1 ("trip.count.minus.1")
// at the top of the vector preheader, replacing all its users.
5040                                           VPBasicBlock *VectorPH) {
5042  if (BTC->getNumUsers() == 0)
5043    return;
5044
5045  VPBuilder Builder(VectorPH, VectorPH->begin());
5046  auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5047  auto *TCMO =
5048      Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5049                        DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5050  BTC->replaceAllUsesWith(TCMO);
5051}
5052
5054  if (Plan.hasScalarVFOnly())
5055    return;
5056
5057  VPTypeAnalysis TypeInfo(Plan);
5058  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5059  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5061  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5062      vp_depth_first_shallow(LoopRegion->getEntry()));
5063  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5064  // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5065  // regions. Those are not materialized explicitly yet. Those vector users are
5066  // still handled in VPReplicateRegion::execute(), via shouldPack().
5067  // TODO: materialize build vectors for replicating recipes in replicating
5068  // regions.
5069  for (VPBasicBlock *VPBB :
5070       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5071    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5073        continue;
5074      auto *DefR = cast<VPSingleDefRecipe>(&R);
5075      auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5076        VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5077        return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5078      };
5079      if ((isa<VPReplicateRecipe>(DefR) &&
5080           cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5081          (isa<VPInstruction>(DefR) &&
5083           !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5084          none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5085        continue;
5086
      // Struct results need a BuildStructVector; everything else a plain
      // BuildVector.
5087      Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5088      unsigned Opcode = ScalarTy->isStructTy()
5091      auto *BuildVector = new VPInstruction(Opcode, {DefR});
5092      BuildVector->insertAfter(DefR);
5093
      // Redirect only vector users (never the BuildVector itself).
5094      DefR->replaceUsesWithIf(
5095          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5096                           VPUser &U, unsigned) {
5097            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5098          });
5099    }
5100  }
5101
5102  // Create explicit VPInstructions to convert vectors to scalars. The current
5103  // implementation is conservative - it may miss some cases that may or may not
5104  // be vector values. TODO: introduce Unpacks speculatively - remove them later
5105  // if they are known to operate on scalar values.
5106  for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5107    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5109                   VPDerivedIVRecipe>(&R))
5110        continue;
5111      for (VPValue *Def : R.definedValues()) {
5112        // Skip recipes that are single-scalar or only have their first lane
5113        // used.
5114        // TODO: The Defs skipped here may or may not be vector values.
5115        // Introduce Unpacks, and remove them later, if they are guaranteed to
5116        // produce scalar values.
5118          continue;
5119
5120        // At the moment, we create unpacks only for scalar users outside
5121        // replicate regions. Recipes inside replicate regions still extract the
5122        // required lanes implicitly.
5123        // TODO: Remove once replicate regions are unrolled completely.
5124        auto IsCandidateUnpackUser = [Def](VPUser *U) {
5125          VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5126          return U->usesScalars(Def) &&
5127                 (!ParentRegion || !ParentRegion->isReplicator());
5128        };
5129        if (none_of(Def->users(), IsCandidateUnpackUser))
5130          continue;
5131
        // Unpacks for phi results go after the phi section; otherwise directly
        // after the defining recipe.
5132        auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5133        if (R.isPhi())
5134          Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5135        else
5136          Unpack->insertAfter(&R);
5137        Def->replaceUsesWithIf(Unpack,
5138                               [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5139                                 return IsCandidateUnpackUser(&U);
5140                               });
5141      }
5142    }
5143  }
5144}
5145
5147    VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5148    bool RequiresScalarEpilogue, VPValue *Step,
5149    std::optional<uint64_t> MaxRuntimeStep) {
5150  VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5151  // There's nothing to do if there are no users of the vector trip count or its
5152  // IR value has already been set.
5153  if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5154    return;
5155
5156  VPValue *TC = Plan.getTripCount();
5157  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5158  VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5159  if (auto *StepR = Step->getDefiningRecipe()) {
5160    assert(StepR->getParent() == VectorPHVPBB &&
5161           "Step must be defined in VectorPHVPBB");
5162    // Insert after Step's definition to maintain valid def-use ordering.
5163    InsertPt = std::next(StepR->getIterator());
5164  }
5165  VPBuilder Builder(VectorPHVPBB, InsertPt);
5166
5167  // For scalable steps, if TC is a constant and is divisible by the maximum
5168  // possible runtime step, then TC % Step == 0 for all valid vscale values
5169  // and the vector trip count equals TC directly.
5170  const APInt *TCVal;
5171  if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5172      TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5173    VectorTC.replaceAllUsesWith(TC);
5174    return;
5175  }
5176
5177  // If the tail is to be folded by masking, round the number of iterations N
5178  // up to a multiple of Step instead of rounding down. This is done by first
5179  // adding Step-1 and then rounding down. Note that it's ok if this addition
5180  // overflows: the vector induction variable will eventually wrap to zero given
5181  // that it starts at zero and its Step is a power of two; the loop will then
5182  // exit, with the last early-exit vector comparison also producing all-true.
5183  if (TailByMasking) {
5184    TC = Builder.createAdd(
5185        TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5186        DebugLoc::getCompilerGenerated(), "n.rnd.up");
5187  }
5188
5189  // Now we need to generate the expression for the part of the loop that the
5190  // vectorized body will execute. This is equal to N - (N % Step) if scalar
5191  // iterations are not required for correctness, or N - Step, otherwise. Step
5192  // is equal to the vectorization factor (number of SIMD elements) times the
5193  // unroll factor (number of SIMD instructions).
5194  VPValue *R =
5195      Builder.createNaryOp(Instruction::URem, {TC, Step},
5196                           DebugLoc::getCompilerGenerated(), "n.mod.vf");
5197
5198  // There are cases where we *must* run at least one iteration in the remainder
5199  // loop. See the cost model for when this can happen. If the step evenly
5200  // divides the trip count, we set the remainder to be equal to the step. If
5201  // the step does not evenly divide the trip count, no adjustment is necessary
5202  // since there will already be scalar iterations. Note that the minimum
5203  // iterations check ensures that N >= Step.
5204  if (RequiresScalarEpilogue) {
    // NOTE(review): "fail folding" in the assert message below looks like a
    // typo for "tail folding" — confirm and fix upstream.
5205    assert(!TailByMasking &&
5206           "requiring scalar epilogue is not supported with fail folding");
5207    VPValue *IsZero =
5208        Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5209    R = Builder.createSelect(IsZero, Step, R);
5210  }
5211
5212  VPValue *Res =
5213      Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5214  VectorTC.replaceAllUsesWith(Res);
5215}
5216
5218                                            ElementCount VFEC) {
5219  // If VF and VFxUF have already been materialized (no remaining users),
5220  // there's nothing more to do.
5221  if (Plan.getVF().isMaterialized()) {
5222    assert(Plan.getVFxUF().isMaterialized() &&
5223           "VF and VFxUF must be materialized together");
5224    return;
5225  }
5226
5227  VPBuilder Builder(VectorPH, VectorPH->begin());
5228  Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5229  VPValue &VF = Plan.getVF();
5230  VPValue &VFxUF = Plan.getVFxUF();
5231  // If there are no users of the runtime VF, compute VFxUF by constant folding
5232  // the multiplication of VF and UF.
5233  if (VF.getNumUsers() == 0) {
5234    VPValue *RuntimeVFxUF =
5235        Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5236    VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5237    return;
5238  }
5239
5240  // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5241  // vscale) * UF.
5242  VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
  // Vector users of VF receive a broadcast of the runtime VF; scalar users
  // keep the scalar value.
5244    VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5246        BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5247  }
5248  VF.replaceAllUsesWith(RuntimeVF);
5249
  // VF * vscale is known not to wrap unsigned (nuw), hence {true, false}.
5250  VPValue *MulByUF = Builder.createOverflowingOp(
5251      Instruction::Mul,
5252      {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5253      {true, false});
5254  VFxUF.replaceAllUsesWith(MulByUF);
5255}
5256
5259  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5260
5261  auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5262  BasicBlock *EntryBB = Entry->getIRBasicBlock();
5263  DenseMap<const SCEV *, Value *> ExpandedSCEVs;
  // Expand each VPExpandSCEVRecipe to IR before the entry block's terminator,
  // replacing the recipe with a live-in for the expanded value.
5264  for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5266      continue;
5267    auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
    // Stop at the first non-expand recipe: all VPExpandSCEVRecipes sit at the
    // beginning of the entry block (asserted below).
5268    if (!ExpSCEV)
5269      break;
5270    const SCEV *Expr = ExpSCEV->getSCEV();
5271    Value *Res =
5272        Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5273    ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5274    VPValue *Exp = Plan.getOrAddLiveIn(Res);
5275    ExpSCEV->replaceAllUsesWith(Exp);
    // Keep the plan's trip count pointing at the expanded value.
5276    if (Plan.getTripCount() == ExpSCEV)
5277      Plan.resetTripCount(Exp);
5278    ExpSCEV->eraseFromParent();
5279  }
5281         "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5282         "before any VPIRInstructions");
5283  // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5284  // to the VPIRBasicBlock.
5285  auto EI = Entry->begin();
5286  for (Instruction &I : drop_end(*EntryBB)) {
    // Skip IR instructions already wrapped by a VPIRInstruction.
5287    if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5288        &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5289      EI++;
5290      continue;
5291    }
5293  }
5294
5295  return ExpandedSCEVs;
5296}
5297
5298/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5299/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5300/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5301/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5302/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5303/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5304/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5305/// is defined at \p Idx of a load interleave group.
5306static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5307 VPValue *OpV, unsigned Idx, bool IsScalable) {
5308 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5309 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5310 if (!Member0OpR)
5311 return Member0Op == OpV;
5312 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5313 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5314 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5315 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5316 Member0Op == OpV;
5317 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5318 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5319 return false;
5320}
5321
/// Returns true if every value in \p Ops is defined by a recipe with the same
/// opcode/intrinsic as Ops[0], and all their operands are either recursively
/// narrowable or satisfy canNarrowLoad against the member-0 recipe.
static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
  // NOTE(review): a line is elided here in this rendering (original line
  // 5323) — restore from upstream before compiling.
  auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
  if (!WideMember0)
    return false;
  // All members must be single-def recipes with the same opcode/intrinsic as
  // the member at index 0.
  for (VPValue *V : Ops) {
    // NOTE(review): the guard condition for this early return is elided in
    // this rendering (original line 5328); presumably it rejects values that
    // are not VPSingleDefRecipes — verify against upstream.
      return false;
    auto *R = cast<VPSingleDefRecipe>(V);
    if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
      return false;
  }

  // Check each operand position across all members: the operands must either
  // form a recursively-narrowable set themselves or satisfy canNarrowLoad.
  for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
    // NOTE(review): the declaration of OpsI is elided in this rendering
    // (original line 5336).
    for (VPValue *Op : Ops)
      OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));

    if (canNarrowOps(OpsI, IsScalable))
      continue;

    if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
          const auto &[OpIdx, OpV] = P;
          return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
        }))
      return false;
  }

  return true;
}
5352
/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
/// number of members both equal to VF. The interleave group must also access
/// the full vector width.
static std::optional<ElementCount> isConsecutiveInterleaveGroup(
    // NOTE(review): the parameter-list line (InterleaveR, VFs, ...) is elided
    // in this rendering (original line 5357) — restore from upstream.
    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
  // A masked group cannot be treated as a single consecutive access.
  if (!InterleaveR || InterleaveR->getMask())
    return std::nullopt;

  // All members of the group must share a single scalar element type.
  Type *GroupElementTy = nullptr;
  if (InterleaveR->getStoredValues().empty()) {
    // Load group: check the values defined by the group.
    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
    if (!all_of(InterleaveR->definedValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return std::nullopt;
  } else {
    // Store group: check the stored values.
    GroupElementTy =
        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
    if (!all_of(InterleaveR->getStoredValues(),
                [&TypeInfo, GroupElementTy](VPValue *Op) {
                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
                }))
      return std::nullopt;
  }

  // The group must have no gaps: factor == number of members.
  auto IG = InterleaveR->getInterleaveGroup();
  if (IG->getFactor() != IG->getNumMembers())
    return std::nullopt;

  auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
    TypeSize Size = TTI.getRegisterBitWidth(
        // NOTE(review): the argument lines of getRegisterBitWidth are elided
        // in this rendering (original lines 5386-5387).
    assert(Size.isScalable() == VF.isScalable() &&
           "if Size is scalable, VF must be scalable and vice versa");
    return Size.getKnownMinValue();
  };

  // Accept the first VF whose factor matches and whose group exactly fills a
  // vector register.
  for (ElementCount VF : VFs) {
    unsigned MinVal = VF.getKnownMinValue();
    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
    if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
      return {VF};
  }
  return std::nullopt;
}
5401
5402/// Returns true if \p VPValue is a narrow VPValue.
5403static bool isAlreadyNarrow(VPValue *VPV) {
5404 if (isa<VPIRValue>(VPV))
5405 return true;
5406 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5407 return RepR && RepR->isSingleScalar();
5408}
5409
// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
// a narrow variant.
static VPValue *
// NOTE(review): the line carrying the function name and parameter list (\p V
// and the NarrowedOps set) is elided in this rendering (original line 5413)
// — restore from upstream before compiling.
  auto *R = V->getDefiningRecipe();
  // Values without a defining recipe, or ones already narrowed earlier, are
  // returned unchanged.
  if (!R || NarrowedOps.contains(V))
    return V;

  if (isAlreadyNarrow(V))
    return V;

  // NOTE(review): the condition opening this block is elided in this
  // rendering (original line 5421); its body recursively narrows each operand
  // of a wide single-def recipe in place.
    auto *WideMember0 = cast<VPSingleDefRecipe>(R);
    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
      WideMember0->setOperand(
          Idx,
          narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
    return V;
  }

  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
    auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
                                    LoadGroup->getMask(), /*Consecutive=*/true,
                                    {}, LoadGroup->getDebugLoc());
    L->insertBefore(LoadGroup);
    NarrowedOps.insert(L);
    return L;
  }

  if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
    // Single-scalar loads are already narrow; just record them.
    assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
           "must be a single scalar load");
    NarrowedOps.insert(RepR);
    return RepR;
  }

  auto *WideLoad = cast<VPWidenLoadRecipe>(R);
  // Use the scalar base pointer, not the per-part vector pointer.
  VPValue *PtrOp = WideLoad->getAddr();
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
    PtrOp = VecPtr->getOperand(0);
  // Narrow wide load to uniform scalar load, as transformed VPlan will only
  // process one original iteration.
  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
                                  /*IsUniform*/ true,
                                  /*Mask*/ nullptr, {}, *WideLoad);
  N->insertBefore(WideLoad);
  NarrowedOps.insert(N);
  return N;
}
5462
std::unique_ptr<VPlan>
// NOTE(review): the line carrying the function name and the leading
// parameter(s) is elided in this rendering (original line 5464) — restore it
// from upstream before compiling.
                                     const TargetTransformInfo &TTI) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();

  if (!VectorLoop)
    return nullptr;

  // Only handle single-block loops for now.
  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
    return nullptr;

  // Skip plans when we may not be able to properly narrow.
  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
  if (!match(&Exiting->back(), m_BranchOnCount()))
    return nullptr;

  assert(match(&Exiting->back(),
               // NOTE(review): one matcher line is elided in this rendering
               // (original line 5481).
               m_Specific(&Plan.getVectorTripCount()))) &&
         "unexpected branch-on-count");

  VPTypeAnalysis TypeInfo(Plan);
  // NOTE(review): a declaration line is elided here (original line 5486);
  // StoreGroups, populated below, is declared on it.
  std::optional<ElementCount> VFToOptimize;
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
    // NOTE(review): the condition lines guarding this continue are elided in
    // this rendering (original lines 5489-5490).
      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    // Only support plans with a canonical induction phi.
    if (R.isPhi())
      return nullptr;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return nullptr;

    // All other ops are allowed, but we reject uses that cannot be converted
    // when checking all allowed consumers (store interleave groups) below.
    if (!InterleaveR)
      continue;

    // Try to find a single VF, where all interleave groups are consecutive and
    // saturate the full vector width. If we already have a candidate VF, check
    // if it is applicable for the current InterleaveR, otherwise look for a
    // suitable VF across the Plan's VFs.
    // NOTE(review): the declaration line opening this initializer (VFs) is
    // elided in this rendering (original line 5513).
        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
                     : to_vector(Plan.vectorFactors());
    std::optional<ElementCount> NarrowedVF =
        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
      return nullptr;
    VFToOptimize = NarrowedVF;

    // Skip read interleave groups.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Narrow interleave groups, if all operands are already matching narrow
    // ops.
    auto *Member0 = InterleaveR->getStoredValues()[0];
    if (isAlreadyNarrow(Member0) &&
        all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // For now, we only support full interleave groups storing load interleave
    // groups.
    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
          return IR && IR->getInterleaveGroup()->isFull() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      StoreGroups.push_back(InterleaveR);
      continue;
    }

    // Check if all values feeding InterleaveR are matching wide recipes, which
    // operands that can be narrowed.
    if (!canNarrowOps(InterleaveR->getStoredValues(),
                      VFToOptimize->isScalable()))
      return nullptr;
    StoreGroups.push_back(InterleaveR);
  }

  if (StoreGroups.empty())
    return nullptr;

  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
  bool RequiresScalarEpilogue =
      MiddleVPBB->getNumSuccessors() == 1 &&
      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
  // Bail out for tail-folding (middle block with a single successor to exit).
  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
    return nullptr;

  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
  // TODO: Handle cases where only some interleave groups can be narrowed.
  std::unique_ptr<VPlan> NewPlan;
  if (size(Plan.vectorFactors()) != 1) {
    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
    Plan.setVF(*VFToOptimize);
    NewPlan->removeVF(*VFToOptimize);
  }

  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
  SmallPtrSet<VPValue *, 4> NarrowedOps;
  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res =
        narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
    auto *SI =
        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
    auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
                                     /*Consecutive=*/true, {},
                                     StoreGroup->getDebugLoc());
    S->insertBefore(StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration.
  // NOTE(review): the line defining CanIVInc (used below to rewrite the
  // canonical IV increment) is elided in this rendering (original line 5596).
  Type *CanIVTy = VectorLoop->getCanonicalIVType();
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPBuilder PHBuilder(VectorPH, VectorPH->begin());

  VPValue *UF = &Plan.getUF();
  VPValue *Step;
  if (VFToOptimize->isScalable()) {
    // Scalable: the narrowed plan advances by vscale * UF per iteration.
    VPValue *VScale =
        PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
    Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
                                         {true, false});
    Plan.getVF().replaceAllUsesWith(VScale);
  } else {
    // Fixed-width: one original iteration per unrolled part.
    Step = UF;
    Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
  }
  // Materialize vector trip count with the narrowed step.
  materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
                             RequiresScalarEpilogue, Step);

  CanIVInc->setOperand(1, Step);
  Plan.getVFxUF().replaceAllUsesWith(Step);

  removeDeadRecipes(Plan);
  assert(none_of(*VectorLoop->getEntryBasicBlock(),
                 // NOTE(review): the predicate line of this assert is elided
                 // in this rendering (original line 5622).
                 "All VPVectorPointerRecipes should have been removed");
  return NewPlan;
}
5626
5627/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5628/// BranchOnCond recipe.
5630 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5631 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5632 auto *MiddleTerm =
5634 // Only add branch metadata if there is a (conditional) terminator.
5635 if (!MiddleTerm)
5636 return;
5637
5638 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5639 "must have a BranchOnCond");
5640 // Assume that `TripCount % VectorStep ` is equally distributed.
5641 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5642 if (VF.isScalable() && VScaleForTuning.has_value())
5643 VectorStep *= *VScaleForTuning;
5644 assert(VectorStep > 0 && "trip count should not be zero");
5645 MDBuilder MDB(Plan.getContext());
5646 MDNode *BranchWeights =
5647 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5648 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5649}
5650
5652 VFRange &Range) {
5653 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5654 auto *MiddleVPBB = Plan.getMiddleBlock();
5655 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5656
5657 auto IsScalableOne = [](ElementCount VF) -> bool {
5658 return VF == ElementCount::getScalable(1);
5659 };
5660
5661 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5662 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5663 if (!FOR)
5664 continue;
5665
5666 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5667 "Cannot handle loops with uncountable early exits");
5668
5669 // This is the second phase of vectorizing first-order recurrences, creating
5670 // extract for users outside the loop. An overview of the transformation is
5671 // described below. Suppose we have the following loop with some use after
5672 // the loop of the last a[i-1],
5673 //
5674 // for (int i = 0; i < n; ++i) {
5675 // t = a[i - 1];
5676 // b[i] = a[i] - t;
5677 // }
5678 // use t;
5679 //
5680 // There is a first-order recurrence on "a". For this loop, the shorthand
5681 // scalar IR looks like:
5682 //
5683 // scalar.ph:
5684 // s.init = a[-1]
5685 // br scalar.body
5686 //
5687 // scalar.body:
5688 // i = phi [0, scalar.ph], [i+1, scalar.body]
5689 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5690 // s2 = a[i]
5691 // b[i] = s2 - s1
5692 // br cond, scalar.body, exit.block
5693 //
5694 // exit.block:
5695 // use = lcssa.phi [s1, scalar.body]
5696 //
5697 // In this example, s1 is a recurrence because it's value depends on the
5698 // previous iteration. In the first phase of vectorization, we created a
5699 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5700 // for users in the scalar preheader and exit block.
5701 //
5702 // vector.ph:
5703 // v_init = vector(..., ..., ..., a[-1])
5704 // br vector.body
5705 //
5706 // vector.body
5707 // i = phi [0, vector.ph], [i+4, vector.body]
5708 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5709 // v2 = a[i, i+1, i+2, i+3]
5710 // b[i] = v2 - v1
5711 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5712 // b[i, i+1, i+2, i+3] = v2 - v1
5713 // br cond, vector.body, middle.block
5714 //
5715 // middle.block:
5716 // vector.recur.extract.for.phi = v2(2)
5717 // vector.recur.extract = v2(3)
5718 // br cond, scalar.ph, exit.block
5719 //
5720 // scalar.ph:
5721 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5722 // [s.init, otherwise]
5723 // br scalar.body
5724 //
5725 // scalar.body:
5726 // i = phi [0, scalar.ph], [i+1, scalar.body]
5727 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5728 // s2 = a[i]
5729 // b[i] = s2 - s1
5730 // br cond, scalar.body, exit.block
5731 //
5732 // exit.block:
5733 // lo = lcssa.phi [s1, scalar.body],
5734 // [vector.recur.extract.for.phi, middle.block]
5735 //
5736 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5737 // Extract the penultimate value of the recurrence and use it as operand for
5738 // the VPIRInstruction modeling the phi.
5740 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5742 continue;
5743
5744 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5745 // penultimate value of the recurrence. Instead we rely on the existing
5746 // extract of the last element from the result of
5747 // VPInstruction::FirstOrderRecurrenceSplice.
5748 // TODO: Consider vscale_range info and UF.
5750 Range))
5751 return;
5752 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5753 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5754 "vector.recur.extract.for.phi");
5755 for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
5756 auto *ExitPhi = dyn_cast<VPIRPhi>(U);
5757 if (!ExitPhi)
5758 continue;
5759 ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
5760 }
5761 }
5762 }
5763}
5764
5765/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5766/// value. Returns the widened IV if found, nullptr otherwise.
5768 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5769 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5770 Instruction::isIntDivRem(BinOp->getOpcode()))
5771 return nullptr;
5772
5773 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5774 VPValue *InvariantCandidate = BinOp->getOperand(1);
5775 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5776 std::swap(WidenIVCandidate, InvariantCandidate);
5777
5778 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5779 return nullptr;
5780
5781 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5782}
5783
5784/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5785/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
5789 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5790 auto *ClonedOp = BinOp->clone();
5791 if (ClonedOp->getOperand(0) == WidenIV) {
5792 ClonedOp->setOperand(0, ScalarIV);
5793 } else {
5794 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5795 ClonedOp->setOperand(1, ScalarIV);
5796 }
5797 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5798 return ClonedOp;
5799}
5800
5803 Loop &L) {
5804 ScalarEvolution &SE = *PSE.getSE();
5805 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5806
5807 // Helper lambda to check if the IV range excludes the sentinel value. Try
5808 // signed first, then unsigned. Return an excluded sentinel if found,
5809 // otherwise return std::nullopt.
5810 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5811 bool UseMax) -> std::optional<APSInt> {
5812 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5813 for (bool Signed : {true, false}) {
5814 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5815 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5816
5817 ConstantRange IVRange =
5818 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5819 if (!IVRange.contains(Sentinel))
5820 return Sentinel;
5821 }
5822 return std::nullopt;
5823 };
5824
5825 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5826 for (VPRecipeBase &Phi :
5827 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5828 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5830 PhiR->getRecurrenceKind()))
5831 continue;
5832
5833 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5834 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5835 continue;
5836
5837 // If there's a header mask, the backedge select will not be the find-last
5838 // select.
5839 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5840 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5841 if (HeaderMask &&
5842 !match(BackedgeVal,
5843 m_Select(m_Specific(HeaderMask),
5844 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5845 llvm_unreachable("expected header mask select");
5846
5847 // Get the find-last expression from the find-last select of the reduction
5848 // phi. The find-last select should be a select between the phi and the
5849 // find-last expression.
5850 VPValue *Cond, *FindLastExpression;
5851 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5852 m_VPValue(FindLastExpression))) &&
5853 !match(FindLastSelect,
5854 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5855 m_Specific(PhiR))))
5856 continue;
5857
5858 // Check if FindLastExpression is a simple expression of a widened IV. If
5859 // so, we can track the underlying IV instead and sink the expression.
5860 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5861 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5862 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5863 &L);
5864 const SCEV *Step;
5865 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5866 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5868 "IVOfExpressionToSink not being an AddRec must imply "
5869 "FindLastExpression not being an AddRec.");
5870 continue;
5871 }
5872
5873 // Determine direction from SCEV step.
5874 if (!SE.isKnownNonZero(Step))
5875 continue;
5876
5877 // Positive step means we need UMax/SMax to find the last IV value, and
5878 // UMin/SMin otherwise.
5879 bool UseMax = SE.isKnownPositive(Step);
5880 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5881 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5882
5883 // Sinking an expression will disable epilogue vectorization. Only use it,
5884 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5885 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5886 // multiply or divide by large constant, respectively), which also makes
5887 // sinking undesirable.
5888 if (IVOfExpressionToSink) {
5889 const SCEV *FindLastExpressionSCEV =
5890 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
5891 if (match(FindLastExpressionSCEV,
5892 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5893 bool NewUseMax = SE.isKnownPositive(Step);
5894 if (auto NewSentinel =
5895 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5896 // The original expression already has a sentinel, so prefer not
5897 // sinking to keep epilogue vectorization possible.
5898 SentinelVal = *NewSentinel;
5899 UseSigned = NewSentinel->isSigned();
5900 UseMax = NewUseMax;
5901 IVSCEV = FindLastExpressionSCEV;
5902 IVOfExpressionToSink = nullptr;
5903 }
5904 }
5905 }
5906
5907 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5908 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5909 // cannot use min/max.
5910 if (!SentinelVal) {
5911 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5912 if (AR->hasNoSignedWrap())
5913 UseSigned = true;
5914 else if (AR->hasNoUnsignedWrap())
5915 UseSigned = false;
5916 else
5917 continue;
5918 }
5919
5921 BackedgeVal,
5923
5924 VPValue *NewFindLastSelect = BackedgeVal;
5925 VPValue *SelectCond = Cond;
5926 if (!SentinelVal || IVOfExpressionToSink) {
5927 // When we need to create a new select, normalize the condition so that
5928 // PhiR is the last operand and include the header mask if needed.
5929 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5930 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
5931 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5932 SelectCond = LoopBuilder.createNot(SelectCond);
5933
5934 // When tail folding, mask the condition with the header mask to prevent
5935 // propagating poison from inactive lanes in the last vector iteration.
5936 if (HeaderMask)
5937 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5938
5939 if (SelectCond != Cond || IVOfExpressionToSink) {
5940 NewFindLastSelect = LoopBuilder.createSelect(
5941 SelectCond,
5942 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5943 PhiR, DL);
5944 }
5945 }
5946
5947 // Create the reduction result in the middle block using sentinel directly.
5948 RecurKind MinMaxKind =
5949 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5950 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5951 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5952 FastMathFlags());
5953 DebugLoc ExitDL = RdxResult->getDebugLoc();
5954 VPBuilder MiddleBuilder(RdxResult);
5955 VPValue *ReducedIV =
5957 NewFindLastSelect, Flags, ExitDL);
5958
5959 // If IVOfExpressionToSink is an expression to sink, sink it now.
5960 VPValue *VectorRegionExitingVal = ReducedIV;
5961 if (IVOfExpressionToSink)
5962 VectorRegionExitingVal =
5963 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5964 ReducedIV, IVOfExpressionToSink);
5965
5966 VPValue *NewRdxResult;
5967 VPValue *StartVPV = PhiR->getStartValue();
5968 if (SentinelVal) {
5969 // Sentinel-based approach: reduce IVs with min/max, compare against
5970 // sentinel to detect if condition was ever true, select accordingly.
5971 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5972 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5973 Sentinel, ExitDL);
5974 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5975 StartVPV, ExitDL);
5976 StartVPV = Sentinel;
5977 } else {
5978 // Introduce a boolean AnyOf reduction to track if the condition was ever
5979 // true in the loop. Use it to select the initial start value, if it was
5980 // never true.
5981 auto *AnyOfPhi = new VPReductionPHIRecipe(
5982 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5983 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5984 AnyOfPhi->insertAfter(PhiR);
5985
5986 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5987 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
5988 AnyOfPhi->setOperand(1, OrVal);
5989
5990 NewRdxResult = MiddleBuilder.createAnyOfReduction(
5991 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
5992
5993 // Initialize the IV reduction phi with the neutral element, not the
5994 // original start value, to ensure correct min/max reduction results.
5995 StartVPV = Plan.getOrAddLiveIn(
5996 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5997 }
5998 RdxResult->replaceAllUsesWith(NewRdxResult);
5999 RdxResult->eraseFromParent();
6000
6001 auto *NewPhiR = new VPReductionPHIRecipe(
6002 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6003 *NewFindLastSelect, RdxUnordered{1}, {},
6004 PhiR->hasUsesOutsideReductionChain());
6005 NewPhiR->insertBefore(PhiR);
6006 PhiR->replaceAllUsesWith(NewPhiR);
6007 PhiR->eraseFromParent();
6008 }
6009}
6010
6011namespace {
6012
using ExtendKind = TTI::PartialReductionExtendKind;
/// Source type and extend kind of one extend operand feeding a partial
/// reduction (per-operand inputs to getPartialReductionCost).
struct ReductionExtend {
  // Scalar source type of the extend; nullptr when not set.
  Type *SrcType = nullptr;
  // Kind of extend; PR_None when not set.
  ExtendKind Kind = ExtendKind::PR_None;
};
6018
/// Describes the extends used to compute the extended reduction operand.
/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
/// operation.
struct ExtendedReductionOperand {
  /// The recipe that consumes the extends.
  VPWidenRecipe *ExtendsUser = nullptr;
  /// Extend descriptions (inputs to getPartialReductionCost).
  /// NOTE(review): presumably ExtendB.Kind == PR_None encodes an absent
  /// ExtendB — confirm against the users of this struct.
  ReductionExtend ExtendA, ExtendB;
};
6028
/// A chain of recipes that form a partial reduction. Matches either
/// reduction_bin_op (extended op, accumulator), or
/// reduction_bin_op (accumulator, extended op).
/// The possible forms of the "extended op" are listed in
/// matchExtendedReductionOperand.
struct VPPartialReductionChain {
  /// The top-level binary operation that forms the reduction to a scalar
  /// after the loop body.
  VPWidenRecipe *ReductionBinOp = nullptr;
  /// The user of the extends that is then reduced.
  ExtendedReductionOperand ExtendedOp;
  /// The recurrence kind for the entire partial reduction chain.
  /// This allows distinguishing between Sub and AddWithSub recurrences,
  /// when the ReductionBinOp is a Instruction::Sub.
  RecurKind RK;
  /// The index of the accumulator operand of ReductionBinOp. The extended op
  /// is `1 - AccumulatorOpIdx`.
  unsigned AccumulatorOpIdx;
  /// NOTE(review): presumably the element-count ratio between the wide input
  /// and the narrowed accumulator — confirm against the code that sets it.
  unsigned ScaleFactor;
};
6049
/// Rewrite the extend patterns feeding a partial reduction into forms that
/// the partial-reduction lowering can consume. Returns the (possibly new)
/// single-def recipe to use in place of \p Op.
static VPSingleDefRecipe *
optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
                                   VPTypeAnalysis &TypeInfo) {
  // reduce.add(mul(ext(A), C))
  // -> reduce.add(mul(ext(A), ext(trunc(C))))
  const APInt *Const;
  if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
    auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
    Instruction::CastOps ExtOpc = ExtA->getOpcode();
    Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
    if (!Op->hasOneUse() ||
        // NOTE(review): the line with the legality query on the constant is
        // elided in this rendering (original line 6061).
            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
      return Op;

    VPBuilder Builder(Op);
    auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
                                          Op->getOperand(1), NarrowTy);
    Type *WideTy = TypeInfo.inferScalarType(ExtA);
    Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
    return Op;
  }

  // reduce.add(abs(sub(ext(A), ext(B))))
  // -> reduce.add(ext(absolute-difference(A, B)))
  VPValue *X, *Y;
  // NOTE(review): the match() condition opening this block (binding X and Y)
  // is elided in this rendering (original lines 6076-6077).
    auto *Sub = Op->getOperand(0)->getDefiningRecipe();
    auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
    assert(Ext->getOpcode() ==
               cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
           "Expected both the LHS and RHS extends to be the same");
    bool IsSigned = Ext->getOpcode() == Instruction::SExt;
    VPBuilder Builder(Op);
    Type *SrcTy = TypeInfo.inferScalarType(X);
    // Freeze both inputs so the max and min below observe consistent values.
    auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
    auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
    auto *Max = Builder.insert(
        new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                   {FreezeX, FreezeY}, SrcTy));
    auto *Min = Builder.insert(
        new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                   {FreezeX, FreezeY}, SrcTy));
    // abs(A - B) computed as max(A, B) - min(A, B).
    auto *AbsDiff =
        Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
    return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
                                   TypeInfo.inferScalarType(Op));
  }

  // reduce.add(ext(mul(ext(A), ext(B))))
  // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
  // TODO: Support this optimization for float types.
  // NOTE(review): the first line of this match() condition is elided in this
  // rendering (original line 6103).
                m_ZExtOrSExt(m_VPValue()))))) {
    auto *Ext = cast<VPWidenCastRecipe>(Op);
    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
    auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    // Only fold when the mul has a single use and both inner extends agree
    // with each other (and with the outer extend, unless LHS == RHS).
    if (!Mul->hasOneUse() ||
        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
        MulLHS->getOpcode() != MulRHS->getOpcode())
      return Op;
    VPBuilder Builder(Mul);
    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
                                               MulLHS->getOperand(0),
                                               Ext->getResultType()));
    Mul->setOperand(1, MulLHS == MulRHS
                           ? Mul->getOperand(0)
                           : Builder.createWidenCast(MulRHS->getOpcode(),
                                                     MulRHS->getOperand(0),
                                                     Ext->getResultType()));
    return Mul;
  }

  return Op;
}
6127
/// Bundle the recipes feeding \p Red into a VPExpressionRecipe describing a
/// partial reduction. The supported input shapes are listed inline below;
/// anything else is unreachable.
static VPExpressionRecipe *
createPartialReductionExpression(VPReductionRecipe *Red) {
  VPValue *VecOp = Red->getVecOp();

  // reduce.[f]add(ext(op))
  // -> VPExpressionRecipe(op, red)
  if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
    return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);

  // reduce.[f]add([f]mul(ext(a), ext(b)))
  // -> VPExpressionRecipe(a, b, mul, red)
  if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
      match(VecOp,
            // NOTE(review): the integer-mul matcher line of this condition is
            // elided in this rendering (original line 6141).
    auto *Mul = cast<VPWidenRecipe>(VecOp);
    auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
  }

  // reduce.add(neg(mul(ext(a), ext(b))))
  // -> VPExpressionRecipe(a, b, mul, sub, red)
  // NOTE(review): the first line of this match() condition is elided in this
  // rendering (original line 6150).
                m_ZExtOrSExt(m_VPValue()))))) {
    auto *Sub = cast<VPWidenRecipe>(VecOp);
    auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
    auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
    auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
    return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
  }

  llvm_unreachable("Unsupported expression");
}
6161
6162// Helper to transform a partial reduction chain into a partial reduction
6163// recipe. Assumes profitability has been checked.
6164static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6165 VPTypeAnalysis &TypeInfo, VPlan &Plan,
6166 VPReductionPHIRecipe *RdxPhi) {
6167 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6168 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6169
6170 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6171 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6172 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6173
6174 // Sub-reductions can be implemented in two ways:
6175 // (1) negate the operand in the vector loop (the default way).
6176 // (2) subtract the reduced value from the init value in the middle block.
6177 // Both ways keep the reduction itself as an 'add' reduction.
6178 //
6179 // The ISD nodes for partial reductions don't support folding the
6180 // sub/negation into its operands because the following is not a valid
6181 // transformation:
6182 // sub(0, mul(ext(a), ext(b)))
6183 // -> mul(ext(a), ext(sub(0, b)))
6184 //
6185 // It's therefore better to choose option (2) such that the partial
6186 // reduction is always positive (starting at '0') and to do a final
6187 // subtract in the middle block.
6188 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6189 Chain.RK != RecurKind::Sub) {
6190 VPBuilder Builder(WidenRecipe);
6191 Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
6192 auto *Zero = Plan.getZero(ElemTy);
6193 auto *NegRecipe =
6194 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6196 Builder.insert(NegRecipe);
6197 ExtendedOp = NegRecipe;
6198 }
6199
6200 // FIXME: Do these transforms before invoking the cost-model.
6201 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);
6202
6203 // Check if WidenRecipe is the final result of the reduction. If so look
6204 // through selects for predicated reductions.
6205 VPValue *Cond = nullptr;
6207 WidenRecipe,
6208 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6209 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6210 RdxPhi->getBackedgeValue() == ExitValue;
6211 assert((!ExitValue || IsLastInChain) &&
6212 "if we found ExitValue, it must match RdxPhi's backedge value");
6213
6214 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
6215 RecurKind RdxKind =
6217 auto *PartialRed = new VPReductionRecipe(
6218 RdxKind,
6219 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6220 : FastMathFlags(),
6221 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6222 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6223 PartialRed->insertBefore(WidenRecipe);
6224
6225 if (Cond)
6226 ExitValue->replaceAllUsesWith(PartialRed);
6227 WidenRecipe->replaceAllUsesWith(PartialRed);
6228
6229 // For cost-model purposes, fold this into a VPExpression.
6230 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6231 E->insertBefore(WidenRecipe);
6232 PartialRed->replaceAllUsesWith(E);
6233
6234 // We only need to update the PHI node once, which is when we find the
6235 // last reduction in the chain.
6236 if (!IsLastInChain)
6237 return;
6238
6239 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6240 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6241 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6242
6243 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6244 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6245 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6246 StartInst->setOperand(2, NewScaleFactor);
6247
6248 // If this is the last value in a sub-reduction chain, then update the PHI
6249 // node to start at `0` and update the reduction-result to subtract from
6250 // the PHI's start value.
6251 if (Chain.RK != RecurKind::Sub)
6252 return;
6253
6254 VPValue *OldStartValue = StartInst->getOperand(0);
6255 StartInst->setOperand(0, StartInst->getOperand(1));
6256
6257 // Replace reduction_result by 'sub (startval, reductionresult)'.
6259 assert(RdxResult && "Could not find reduction result");
6260
6261 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6262 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6263 VPInstruction *NewResult = Builder.createNaryOp(
6264 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6265 RdxPhi->getDebugLoc());
6266 RdxResult->replaceUsesWithIf(
6267 NewResult,
6268 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6269}
6270
6271/// Returns the cost of a link in a partial-reduction chain for a given VF.
6272static InstructionCost
6273getPartialReductionLinkCost(VPCostContext &CostCtx,
6274 const VPPartialReductionChain &Link,
6275 ElementCount VF) {
6276 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6277 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6278 std::optional<unsigned> BinOpc = std::nullopt;
6279 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6280 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6281 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6282
6283 std::optional<llvm::FastMathFlags> Flags;
6284 if (RdxType->isFloatingPointTy())
6285 Flags = Link.ReductionBinOp->getFastMathFlags();
6286
6287 unsigned Opcode = Link.RK == RecurKind::Sub
6288 ? (unsigned)Instruction::Add
6289 : Link.ReductionBinOp->getOpcode();
6290 return CostCtx.TTI.getPartialReductionCost(
6291 Opcode, ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType, RdxType,
6292 VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6293 CostCtx.CostKind, Flags);
6294}
6295
6296static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6298}
6299
6300/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6301/// operand. This is an operand where the source of the value (e.g. a load) has
6302/// been extended (sext, zext, or fpext) before it is used in the reduction.
6303///
6304/// Possible forms matched by this function:
6305/// - UpdateR(PrevValue, ext(...))
6306/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6307/// - UpdateR(PrevValue, mul(ext(...), Constant))
6308/// - UpdateR(PrevValue, neg(mul(ext(...), ext(...))))
6309/// - UpdateR(PrevValue, neg(mul(ext(...), Constant)))
6310/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6311/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6312/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6313///
6314/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6315static std::optional<ExtendedReductionOperand>
6316matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6317 VPTypeAnalysis &TypeInfo) {
6318 assert(is_contained(UpdateR->operands(), Op) &&
6319 "Op should be operand of UpdateR");
6320
6321 // Try matching an absolute difference operand of the form
6322 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6323 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6324 // difference on a wider type and get the extend for "free" from the partial
6325 // reduction.
6326 VPValue *X, *Y;
6327 if (Op->hasOneUse() &&
6331 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6332 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6333 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6334 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6335 Type *LHSInputType = TypeInfo.inferScalarType(X);
6336 Type *RHSInputType = TypeInfo.inferScalarType(Y);
6337 if (LHSInputType != RHSInputType ||
6338 LHSExt->getOpcode() != RHSExt->getOpcode())
6339 return std::nullopt;
6340 // Note: This is essentially the same as matching ext(...) as we will
6341 // rewrite this operand to ext(absolute-difference(A, B)).
6342 return ExtendedReductionOperand{
6343 Sub,
6344 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6345 /*ExtendB=*/{}};
6346 }
6347
6348 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6350 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6351 VPValue *CastSource = CastRecipe->getOperand(0);
6352 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6353 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6354 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6355 // Match: ext(mul(...))
6356 // Record the outer extend kind and set `Op` to the mul. We can then match
6357 // this as a binary operation. Note: We can optimize out the outer extend
6358 // by widening the inner extends to match it. See
6359 // optimizeExtendsForPartialReduction.
6360 Op = CastSource;
6361 } else if (UpdateR->getOpcode() == Instruction::Add ||
6362 UpdateR->getOpcode() == Instruction::FAdd) {
6363 // Match: UpdateR(PrevValue, ext(...))
6364 // TODO: Remove the add/fadd restriction (we should be able to handle this
6365 // case for sub reductions too).
6366 return ExtendedReductionOperand{
6367 UpdateR,
6368 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6369 /*ExtendB=*/{}};
6370 }
6371 }
6372
6373 if (!Op->hasOneUse())
6374 return std::nullopt;
6375
6377 if (!MulOp ||
6378 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6379 return std::nullopt;
6380
6381 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6382 // binary operation.
6383
6384 VPValue *LHS = MulOp->getOperand(0);
6385 VPValue *RHS = MulOp->getOperand(1);
6386
6387 // The LHS of the operation must always be an extend.
6389 return std::nullopt;
6390
6391 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6392 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6393 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6394
6395 // The RHS of the operation can be an extend or a constant integer.
6396 const APInt *RHSConst = nullptr;
6397 VPWidenCastRecipe *RHSCast = nullptr;
6399 RHSCast = cast<VPWidenCastRecipe>(RHS);
6400 else if (!match(RHS, m_APInt(RHSConst)) ||
6401 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6402 return std::nullopt;
6403
6404 // The outer extend kind must match the inner extends for folding.
6405 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6406 if (Cast && OuterExtKind &&
6407 getPartialReductionExtendKind(Cast) != OuterExtKind)
6408 return std::nullopt;
6409
6410 Type *RHSInputType = LHSInputType;
6411 ExtendKind RHSExtendKind = LHSExtendKind;
6412 if (RHSCast) {
6413 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6414 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6415 }
6416
6417 return ExtendedReductionOperand{
6418 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6419}
6420
6421/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6422/// and determines if the target can use a cheaper operation with a wider
6423/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6424/// of operations in the reduction.
6425static std::optional<SmallVector<VPPartialReductionChain>>
6426getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6427 VFRange &Range) {
6428 // Get the backedge value from the reduction PHI and find the
6429 // ComputeReductionResult that uses it (directly or through a select for
6430 // predicated reductions).
6431 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6432 if (!RdxResult)
6433 return std::nullopt;
6434 VPValue *ExitValue = RdxResult->getOperand(0);
6435 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6436
6437 VPTypeAnalysis &TypeInfo = CostCtx.Types;
6439 RecurKind RK = RedPhiR->getRecurrenceKind();
6440 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6441 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6442
6443 // Work backwards from the ExitValue examining each reduction operation.
6444 VPValue *CurrentValue = ExitValue;
6445 while (CurrentValue != RedPhiR) {
6446 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6447 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6448 return std::nullopt;
6449
6450 VPValue *Op = UpdateR->getOperand(1);
6451 VPValue *PrevValue = UpdateR->getOperand(0);
6452
6453 // Find the extended operand. The other operand (PrevValue) is the next link
6454 // in the reduction chain.
6455 std::optional<ExtendedReductionOperand> ExtendedOp =
6456 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6457 if (!ExtendedOp) {
6458 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6459 if (!ExtendedOp)
6460 return std::nullopt;
6461 std::swap(Op, PrevValue);
6462 }
6463
6464 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6465 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6466 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6467 return std::nullopt;
6468
6469 // Check if a partial reduction chain is supported by the target (i.e. does
6470 // not have an invalid cost) for the given VF range. Clamps the range and
6471 // returns true if feasible for any VF.
6472 VPPartialReductionChain Link(
6473 {UpdateR, *ExtendedOp, RK,
6474 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6475 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6476 Chain.push_back(Link);
6477 CurrentValue = PrevValue;
6478 }
6479
6480 // The chain links were collected by traversing backwards from the exit value.
6481 // Reverse the chains so they are in program order.
6482 std::reverse(Chain.begin(), Chain.end());
6483 return Chain;
6484}
6485} // namespace
6486
6488 VPCostContext &CostCtx,
6489 VFRange &Range) {
6490 // Find all possible valid partial reductions, grouping chains by their PHI.
6491 // This grouping allows invalidating the whole chain, if any link is not a
6492 // valid partial reduction.
6494 ChainsByPhi;
6495 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6496 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6497 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6498 if (!RedPhiR)
6499 continue;
6500
6501 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6502 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6503 }
6504
6505 if (ChainsByPhi.empty())
6506 return;
6507
6508 // Build set of partial reduction operations for extend user validation and
6509 // a map of reduction bin ops to their scale factors for scale validation.
6510 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6511 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6512 for (const auto &[_, Chains] : ChainsByPhi)
6513 for (const VPPartialReductionChain &Chain : Chains) {
6514 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6515 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6516 }
6517
6518 // A partial reduction is invalid if any of its extends are used by
6519 // something that isn't another partial reduction. This is because the
6520 // extends are intended to be lowered along with the reduction itself.
6521 auto ExtendUsersValid = [&](VPValue *Ext) {
6522 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6523 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6524 });
6525 };
6526
6527 auto IsProfitablePartialReductionChainForVF =
6528 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6529 InstructionCost PartialCost = 0, RegularCost = 0;
6530
6531 // The chain is a profitable partial reduction chain if the cost of handling
6532 // the entire chain is cheaper when using partial reductions than when
6533 // handling the entire chain using regular reductions.
6534 for (const VPPartialReductionChain &Link : Chain) {
6535 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6536 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6537 if (!LinkCost.isValid())
6538 return false;
6539
6540 PartialCost += LinkCost;
6541 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6542 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6543 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6544 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6545 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6546 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6547 RegularCost += Extend->computeCost(VF, CostCtx);
6548 }
6549 return PartialCost.isValid() && PartialCost < RegularCost;
6550 };
6551
6552 // Validate chains: check that extends are only used by partial reductions,
6553 // and that reduction bin ops are only used by other partial reductions with
6554 // matching scale factors, are outside the loop region or the select
6555 // introduced by tail-folding. Otherwise we would create users of scaled
6556 // reductions where the types of the other operands don't match.
6557 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6558 for (const VPPartialReductionChain &Chain : Chains) {
6559 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6560 Chains.clear();
6561 break;
6562 }
6563 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6564 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6565 return PhiR == RedPhiR;
6566 auto *R = cast<VPSingleDefRecipe>(U);
6567 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6569 m_Specific(Chain.ReductionBinOp))) ||
6570 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6571 m_Specific(RedPhiR)));
6572 };
6573 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6574 Chains.clear();
6575 break;
6576 }
6577
6578 // Check if the compute-reduction-result is used by a sunk store.
6579 // TODO: Also form partial reductions in those cases.
6580 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6581 if (any_of(RdxResult->users(), [](VPUser *U) {
6582 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6583 return RepR && RepR->getOpcode() == Instruction::Store;
6584 })) {
6585 Chains.clear();
6586 break;
6587 }
6588 }
6589 }
6590
6591 // Clear the chain if it is not profitable.
6593 [&, &Chains = Chains](ElementCount VF) {
6594 return IsProfitablePartialReductionChainForVF(Chains, VF);
6595 },
6596 Range))
6597 Chains.clear();
6598 }
6599
6600 for (auto &[Phi, Chains] : ChainsByPhi)
6601 for (const VPPartialReductionChain &Chain : Chains)
6602 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6603}
6604
6606 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6607 // Collect all loads/stores first. We will start with ones having simpler
6608 // decisions followed by more complex ones that are potentially
6609 // guided/dependent on the simpler ones.
6611 for (VPBasicBlock *VPBB :
6614 for (VPRecipeBase &R : *VPBB) {
6615 auto *VPI = dyn_cast<VPInstruction>(&R);
6616 if (VPI && VPI->getUnderlyingValue() &&
6617 is_contained({Instruction::Load, Instruction::Store},
6618 VPI->getOpcode()))
6619 MemOps.push_back(VPI);
6620 }
6621 }
6622
6623 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6624 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6625
6626 for (VPInstruction *VPI : MemOps) {
6627 auto ReplaceWith = [&](VPRecipeBase *New) {
6628 RecipeBuilder.setRecipe(cast<Instruction>(VPI->getUnderlyingValue()),
6629 New);
6630 New->insertBefore(VPI);
6631 if (VPI->getOpcode() == Instruction::Load)
6632 VPI->replaceAllUsesWith(New->getVPSingleValue());
6633 VPI->eraseFromParent();
6634 };
6635
6636 // Note: we must do that for scalar VPlan as well.
6637 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6638 FinalRedStoresBuilder))
6639 continue;
6640
6641 // Filter out scalar VPlan for the remaining memory operations.
6643 [](ElementCount VF) { return VF.isScalar(); }, Range))
6644 continue;
6645
6646 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6647 ReplaceWith(Histogram);
6648 continue;
6649 }
6650
6651 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6652 if (!Recipe)
6653 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6654
6655 ReplaceWith(Recipe);
6656 }
6657}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1054
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1027
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1685
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3793
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4160
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4235
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4187
iterator end()
Definition VPlan.h:4197
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4195
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4248
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4209
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2780
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2816
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2806
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2822
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2802
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:303
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:204
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:222
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:240
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:276
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:260
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3277
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1670
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3825
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:498
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:471
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:483
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:493
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3909
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3322
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2300
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2342
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2331
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2045
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4313
Class to record and manage LLVM IR flags.
Definition VPlan.h:688
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1168
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1223
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1324
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1267
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1318
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1262
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1259
@ CanonicalIVIncrementForPart
Definition VPlan.h:1243
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1270
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2918
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2910
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2939
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2991
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2949
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3464
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4505
VPBasicBlock * getParent()
Definition VPlan.h:480
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
void setRecipe(Instruction *I, VPRecipeBase *R)
Set the recipe created for given ingredient.
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3151
A recipe for handling reduction phis.
Definition VPlan.h:2686
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2733
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2726
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2744
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3042
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4370
const VPBlockBase * getEntry() const
Definition VPlan.h:4414
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4446
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:880
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4431
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4490
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4498
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4482
const VPBlockBase * getExiting() const
Definition VPlan.h:4426
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4439
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3196
bool isSingleScalar() const
Definition VPlan.h:3237
bool isPredicated() const
Definition VPlan.h:3239
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3261
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3980
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:606
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:673
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:329
operand_range operands()
Definition VPlanValue.h:397
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:373
unsigned getNumOperands() const
Definition VPlanValue.h:367
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:368
void addOperand(VPValue *Operand)
Definition VPlanValue.h:362
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1495
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1498
unsigned getNumUsers() const
Definition VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1504
user_range users()
Definition VPlanValue.h:155
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2151
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3868
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1837
Instruction::CastOps getOpcode() const
Definition VPlan.h:1875
A recipe for handling GEP instructions.
Definition VPlan.h:2087
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2366
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2394
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2397
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2417
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2448
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2495
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2499
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2526
A recipe for widening vector intrinsics.
Definition VPlan.h:1889
A common base class for widening memory operations.
Definition VPlan.h:3507
A recipe for widened phis.
Definition VPlan.h:2584
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1781
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1801
unsigned getOpcode() const
Definition VPlan.h:1818
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4518
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4834
bool hasVF(ElementCount VF) const
Definition VPlan.h:4732
const DataLayout & getDataLayout() const
Definition VPlan.h:4714
LLVMContext & getContext() const
Definition VPlan.h:4710
VPBasicBlock * getEntry()
Definition VPlan.h:4610
bool hasScalableVF() const
Definition VPlan.h:4733
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4669
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4690
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4739
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4805
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4708
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4811
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4883
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4837
bool hasUF(unsigned UF) const
Definition VPlan.h:4757
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4659
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4698
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4695
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4782
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4808
void setVF(ElementCount VF)
Definition VPlan.h:4720
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4773
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1095
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4760
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4683
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4635
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4860
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4802
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4705
bool hasScalarVFOnly() const
Definition VPlan.h:4750
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4649
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4615
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4701
void setUF(unsigned UF)
Definition VPlan.h:4765
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:4915
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1243
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4816
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2814
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:265
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1879
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2668
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2624
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:240
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:288
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3626
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3586
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3710
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3667
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...