LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/TypeSwitch.h"
32#include "llvm/Analysis/Loads.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
89 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
90 Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc(), GEP);
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96
97 // The noalias.scope.decl intrinsic declares a noalias scope that
98 // is valid for a single iteration. Emitting it as a single-scalar
99 // replicate would incorrectly extend the scope across multiple
100 // original iterations packed into one vector iteration.
101 // FIXME: If we want to vectorize this loop, then we have to drop
102 // all the associated !alias.scope and !noalias.
103 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
104 return false;
105
106 // These intrinsics are recognized by getVectorIntrinsicIDForCall
107 // but are not widenable. Emit them as replicate instead of widening.
108 if (VectorID == Intrinsic::assume ||
109 VectorID == Intrinsic::lifetime_end ||
110 VectorID == Intrinsic::lifetime_start ||
111 VectorID == Intrinsic::sideeffect ||
112 VectorID == Intrinsic::pseudoprobe) {
113 // If the operand of llvm.assume holds before vectorization, it will
114 // also hold per lane.
115 // llvm.pseudoprobe requires to be duplicated per lane for accurate
116 // sample count.
117 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
118 VectorID != Intrinsic::pseudoprobe;
119 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
120 /*IsSingleScalar=*/IsSingleScalar,
121 /*Mask=*/nullptr, *VPI, *VPI,
122 Ingredient.getDebugLoc());
123 } else {
124 NewRecipe = new VPWidenIntrinsicRecipe(
125 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
126 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
127 }
128 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
129 NewRecipe = new VPWidenCastRecipe(
130 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
131 VPIRFlags(*CI), VPIRMetadata(*CI));
132 } else {
133 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
134 *VPI, Ingredient.getDebugLoc());
135 }
136 } else {
138 "inductions must be created earlier");
139 continue;
140 }
141
142 NewRecipe->insertBefore(&Ingredient);
143 if (NewRecipe->getNumDefinedValues() == 1)
144 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
145 else
146 assert(NewRecipe->getNumDefinedValues() == 0 &&
147 "Only recpies with zero or one defined values expected");
148 Ingredient.eraseFromParent();
149 }
150 }
151 return true;
152}
153
154/// Helper for extra no-alias checks via known-safe recipe and SCEV.
157 VPReplicateRecipe &GroupLeader;
158 PredicatedScalarEvolution *PSE = nullptr;
159 const Loop *L = nullptr;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
// Only pairs of store recipes are handled; every path that cannot establish a
// constant distance between the two addresses answers false, except the
// early exit flagged below.
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
// NOTE(review): without a PSE or loop this returns `A == B`, i.e. it reports
// no-alias exactly when both pointers name the same recipe. Confirm this is
// intended rather than a conservative `return false`.
168 if (!PSE || !L)
169 return A == B;
170
// Operand 1 of each store recipe is used as its address.
171 VPValue *AddrA = A->getOperand(1);
172 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, *PSE, L);
173 VPValue *AddrB = B->getOperand(1);
174 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, *PSE, L);
// NOTE(review): the condition guarding the next return is not visible in this
// listing.
176 return false;
177
// The difference of the two address SCEVs must match a constant APInt.
178 const APInt *Distance;
179 ScalarEvolution &SE = *PSE->getSE();
180 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
181 return false;
182
// Operand 0 of each store recipe supplies the type whose store size is used.
183 const DataLayout &DL = SE.getDataLayout();
184 Type *TyA = A->getOperand(0)->getScalarType();
185 uint64_t SizeA = DL.getTypeStoreSize(TyA);
186 Type *TyB = B->getOperand(0)->getScalarType();
187 uint64_t SizeB = DL.getTypeStoreSize(TyB);
188
189 // Use the maximum store size to ensure no overlap from either direction.
190 // Currently only handles fixed sizes, as it is only used for
191 // replicating VPReplicateRecipes.
192 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
193
194 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): the definition of MaxVF is not visible in this listing;
// presumably it is derived from VFs - confirm.
196 if (MaxVF.isScalable())
197 return false;
// No-alias when |Distance| >= MaxVF * MaxStoreSize.
198 return Distance->abs().uge(
199 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
200 }
201
202public:
205 const Loop &L)
206 : ExcludeRecipes(ExcludeRecipes.begin(), ExcludeRecipes.end()),
207 GroupLeader(GroupLeader), PSE(&PSE), L(&L) {}
208
209 SinkStoreInfo(VPReplicateRecipe &GroupLeader) : GroupLeader(GroupLeader) {}
210
211 /// Return true if \p R should be skipped during alias checking, either
212 /// because it's in the exclude set or because no-alias can be proven via
213 /// SCEV.
214 bool shouldSkip(VPRecipeBase &R) const {
215 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
216 return ExcludeRecipes.contains(Store) ||
217 (Store && isNoAliasViaDistance(Store, &GroupLeader));
218 }
219};
220
221/// Check if a memory operation doesn't alias with memory operations using
222/// scoped noalias metadata, in blocks in the single-successor chain between \p
223/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
224/// write to memory are checked (for load hoisting). Otherwise recipes that both
225/// read and write memory are checked, and SCEV is used to prove no-alias
226/// between the group leader and other replicate recipes (for store sinking).
227static bool
229 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
230 std::optional<SinkStoreInfo> SinkInfo = {}) {
231 bool CheckReads = SinkInfo.has_value();
232 if (!MemLoc.AATags.Scope)
233 return false;
234
235 for (VPBasicBlock *VPBB :
237 for (VPRecipeBase &R : *VPBB) {
238 if (SinkInfo && SinkInfo->shouldSkip(R))
239 continue;
240
241 // Skip recipes that don't need checking.
242 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
243 continue;
244
246 if (!Loc)
247 // Conservatively assume aliasing for memory operations without
248 // location.
249 return false;
250
252 return false;
253 }
254 }
255 return true;
256}
257
258/// Get the value type of the replicate load or store. \p IsLoad indicates
259/// whether it is a load.
261 return (IsLoad ? R : R->getOperand(0))->getScalarType();
262}
263
264/// Collect either replicated Loads or Stores grouped by their address SCEV and
265/// their load-store type, in a deep-traversal of the vector loop region in \p
266/// Plan.
267template <unsigned Opcode>
270 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
271 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
272 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
273 "Only Load and Store opcodes supported");
274 constexpr bool IsLoad = (Opcode == Instruction::Load);
277 RecipesByAddressAndType;
280 for (VPRecipeBase &R : *VPBB) {
281 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
282 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
283 continue;
284
285 // For loads, operand 0 is address; for stores, operand 1 is address.
286 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
287 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
288 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
289 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
290 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
291 }
292 }
293 auto Groups = to_vector(RecipesByAddressAndType.values());
294 VPDominatorTree VPDT(Plan);
295 for (auto &Group : Groups) {
296 // Sort mem ops by dominance order, with earliest (most dominating) first.
298 return VPDT.properlyDominates(A, B);
299 });
300 }
301 return Groups;
302}
303
/// Sink single-def recipes that define operands of recipes in a replicate
/// region's conditional block into that block, cloning a candidate first when
/// users outside the block only need its first lane. Returns true if at least
/// one recipe was moved.
/// NOTE(review): several lines of this function (among them the WorkList
/// declaration) are not visible in this listing.
304static bool sinkScalarOperands(VPlan &Plan) {
305 auto Iter = vp_depth_first_deep(Plan.getEntry());
306 bool ScalarVFOnly = Plan.hasScalarVFOnly();
307 bool Changed = false;
308
// Queue (SinkTo, defining recipe of Op) on WorkList if the recipe passes the
// checks below.
310 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
311 VPBasicBlock *SinkTo, VPValue *Op) {
312 auto *Candidate =
313 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
314 if (!Candidate)
315 return;
316
317 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
318 // for now.
320 return;
321
// Nothing to do if the candidate already lives in SinkTo, or if
// cannotHoistOrSinkRecipe rejects sinking it.
322 if (Candidate->getParent() == SinkTo ||
323 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
324 return;
325
// Unless the plan has only a scalar VF, single-scalar replicate recipes are
// not queued.
326 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
327 if (!ScalarVFOnly && RepR->isSingleScalar())
328 return;
329
330 WorkList.insert({SinkTo, Candidate});
331 };
332
333 // First, collect the operands of all recipes in replicate blocks as seeds for
334 // sinking.
336 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
337 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
338 continue;
// Only the first successor of the region entry is considered, and only if it
// leads directly to the region's exiting block.
339 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
340 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
341 continue;
342 for (auto &Recipe : *VPBB)
343 for (VPValue *Op : Recipe.operands())
344 InsertIfValidSinkCandidate(VPBB, Op);
345 }
346
347 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// The size is re-read on every iteration because operands of sunk recipes
// are inserted into WorkList inside the loop.
348 for (unsigned I = 0; I != WorkList.size(); ++I) {
349 VPBasicBlock *SinkTo;
350 VPSingleDefRecipe *SinkCandidate;
351 std::tie(SinkTo, SinkCandidate) = WorkList[I];
352
353 // All recipe users of SinkCandidate must be in the same block SinkTo or all
354 // users outside of SinkTo must only use the first lane of SinkCandidate. In
355 // the latter case, we need to duplicate SinkCandidate.
356 auto UsersOutsideSinkTo =
357 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
358 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
359 });
360 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
361 return !U->usesFirstLaneOnly(SinkCandidate);
362 }))
363 continue;
364 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
365
366 if (NeedsDuplicating) {
// Candidates that would need duplicating are skipped in scalar-VF-only plans.
367 if (ScalarVFOnly)
368 continue;
369 VPSingleDefRecipe *Clone;
370 if (auto *SinkCandidateRepR =
371 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
372 // TODO: Handle converting to uniform recipes as separate transform,
373 // then cloning should be sufficient here.
375 SinkCandidateRepR->getOpcode(), SinkCandidate->operands(),
376 /*Mask=*/nullptr, *SinkCandidateRepR, *SinkCandidateRepR,
377 SinkCandidate->getDebugLoc(), SinkCandidate->getUnderlyingInstr());
378 // TODO: add ".cloned" suffix to name of Clone's VPValue.
379 } else {
380 Clone = SinkCandidate->clone();
381 }
382
// The clone stays at the candidate's old position and takes over all uses
// outside SinkTo.
383 Clone->insertBefore(SinkCandidate);
384 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
385 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
386 });
387 }
// Move the candidate to SinkTo's first non-phi position, then consider its
// own operands as further candidates for the same block.
388 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
389 for (VPValue *Op : SinkCandidate->operands())
390 InsertIfValidSinkCandidate(SinkTo, Op);
391 Changed = true;
392 }
393 return Changed;
394}
395
396/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
397/// the mask.
399 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
400 if (!EntryBB || EntryBB->size() != 1 ||
401 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
402 return nullptr;
403
404 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
405}
406
407/// If \p R is a triangle region, return the 'then' block of the triangle.
409 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
410 if (EntryBB->getNumSuccessors() != 2)
411 return nullptr;
412
413 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
414 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
415 if (!Succ0 || !Succ1)
416 return nullptr;
417
418 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
419 return nullptr;
420 if (Succ0->getSingleSuccessor() == Succ1)
421 return Succ0;
422 if (Succ1->getSingleSuccessor() == Succ0)
423 return Succ1;
424 return nullptr;
425}
426
427// Merge replicate regions in their successor region, if a replicate region
428// is connected to a successor replicate region with the same predicate by a
429// single, empty VPBasicBlock.
431 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
432
433 // Collect replicate regions followed by an empty block, followed by another
434 // replicate region with matching masks to process front. This is to avoid
435 // iterator invalidation issues while merging regions.
438 vp_depth_first_deep(Plan.getEntry()))) {
439 if (!Region1->isReplicator())
440 continue;
441 auto *MiddleBasicBlock =
442 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
443 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
444 continue;
445
446 auto *Region2 =
447 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
448 if (!Region2 || !Region2->isReplicator())
449 continue;
450
451 VPValue *Mask1 = getPredicatedMask(Region1);
452 VPValue *Mask2 = getPredicatedMask(Region2);
453 if (!Mask1 || Mask1 != Mask2)
454 continue;
455
456 assert(Mask1 && Mask2 && "both region must have conditions");
457 WorkList.push_back(Region1);
458 }
459
460 // Move recipes from Region1 to its successor region, if both are triangles.
461 for (VPRegionBlock *Region1 : WorkList) {
462 if (TransformedRegions.contains(Region1))
463 continue;
464 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
465 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
466
467 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
468 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
469 if (!Then1 || !Then2)
470 continue;
471
472 // Note: No fusion-preventing memory dependencies are expected in either
473 // region. Such dependencies should be rejected during earlier dependence
474 // checks, which guarantee accesses can be re-ordered for vectorization.
475 //
476 // Move recipes to the successor region.
477 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
478 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
479
480 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
481 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
482
483 // Move VPPredInstPHIRecipes from the merge block to the successor region's
484 // merge block. Update all users inside the successor region to use the
485 // original values.
486 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
487 VPValue *PredInst1 =
488 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
489 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
490 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
491 return cast<VPRecipeBase>(&U)->getParent() == Then2;
492 });
493
494 // Remove phi recipes that are unused after merging the regions.
495 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
496 Phi1ToMove.eraseFromParent();
497 continue;
498 }
499 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
500 }
501
502 // Remove the dead recipes in Region1's entry block.
503 for (VPRecipeBase &R :
504 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
505 R.eraseFromParent();
506
507 // Finally, remove the first region.
508 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
509 VPBlockUtils::disconnectBlocks(Pred, Region1);
510 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
511 }
512 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
513 TransformedRegions.insert(Region1);
514 }
515
516 return !TransformedRegions.empty();
517}
518
520 VPRegionBlock *ParentRegion,
521 VPlan &Plan) {
522 Instruction *Instr = PredRecipe->getUnderlyingInstr();
523 // Build the triangular if-then region.
524 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
525 assert(Instr->getParent() && "Predicated instruction not in any basic block");
526 auto *BlockInMask = PredRecipe->getMask();
527 auto *MaskDef = BlockInMask->getDefiningRecipe();
528 auto *BOMRecipe = new VPBranchOnMaskRecipe(
529 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
530 auto *Entry =
531 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
532
533 // Replace predicated replicate recipe with a replicate recipe without a
534 // mask but in the replicate region.
535 auto *RecipeWithoutMask = new VPReplicateRecipe(
536 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
537 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
538 PredRecipe->getDebugLoc());
539 auto *Pred =
540 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
541 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
543 Plan.createReplicateRegion(Entry, Exiting, RegionName);
544
545 // Note: first set Entry as region entry and then connect successors starting
546 // from it in order, to propagate the "parent" of each VPBasicBlock.
547 Region->setParent(ParentRegion);
548 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
549 VPBlockUtils::connectBlocks(Pred, Exiting);
550
551 if (PredRecipe->getNumUsers() != 0) {
552 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
553 RecipeWithoutMask->getDebugLoc());
554 Exiting->appendRecipe(PHIRecipe);
555 PredRecipe->replaceAllUsesWith(PHIRecipe);
556 }
557 PredRecipe->eraseFromParent();
558 return Region;
559}
560
/// For every predicated VPReplicateRecipe in \p Plan, split its block at the
/// recipe and call createReplicateRegion for it.
/// NOTE(review): the WorkList declaration, the loop header over blocks and
/// the lines that bind and wire up `Region` are not visible in this listing.
561static void addReplicateRegions(VPlan &Plan) {
564 vp_depth_first_deep(Plan.getEntry()))) {
// Collect the predicated replicate recipes first; blocks are only split in
// the second loop.
565 for (VPRecipeBase &R : *VPBB)
566 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
567 if (RepR->isPredicated())
568 WorkList.push_back(RepR);
569 }
570 }
571
// BBNum numbers the split blocks whose original IR block has a name.
572 unsigned BBNum = 0;
573 for (VPReplicateRecipe *RepR : WorkList) {
574 VPBasicBlock *CurrentBlock = RepR->getParent();
575 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
576
// Name the split block "<orig-name>.<N>", or leave it unnamed if the
// underlying IR block has no name.
577 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
578 SplitBlock->setName(
579 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
580 // Record predicated instructions for above packing optimizations.
582 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
584
// If CurrentBlock was the exiting block of the enclosing region, SplitBlock
// takes over that role.
585 VPRegionBlock *ParentRegion = Region->getParent();
586 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
587 ParentRegion->setExiting(SplitBlock);
588 }
589}
590
594 vp_depth_first_deep(Plan.getEntry()))) {
595 // Don't fold the blocks in the skeleton of the Plan into their single
596 // predecessors for now.
597 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
598 if (!VPBB->getParent())
599 continue;
600 auto *PredVPBB =
601 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
602 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
603 isa<VPIRBasicBlock>(PredVPBB))
604 continue;
605 WorkList.push_back(VPBB);
606 }
607
608 for (VPBasicBlock *VPBB : WorkList) {
609 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
610 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
611 R.moveBefore(*PredVPBB, PredVPBB->end());
612 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
613 auto *ParentRegion = VPBB->getParent();
614 if (ParentRegion && ParentRegion->getExiting() == VPBB)
615 ParentRegion->setExiting(PredVPBB);
616 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
617 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
618 }
619 return !WorkList.empty();
620}
621
623 // Convert masked VPReplicateRecipes to if-then region blocks.
625
626 bool ShouldSimplify = true;
627 while (ShouldSimplify) {
628 ShouldSimplify = sinkScalarOperands(Plan);
629 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
630 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
631 }
632}
633
634/// Remove redundant casts of inductions.
635///
636/// Such redundant casts are casts of induction variables that can be ignored,
637/// because we already proved that the casted phi is equal to the uncasted phi
638/// in the vectorized loop. There is no need to vectorize the cast - the same
639/// value can be used for both the phi and casts in the vector loop.
641 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
643 if (!IV || IV->getTruncInst())
644 continue;
645
646 // A sequence of IR Casts has potentially been recorded for IV, which
647 // *must be bypassed* when the IV is vectorized, because the vectorized IV
648 // will produce the desired casted value. This sequence forms a def-use
649 // chain and is provided in reverse order, ending with the cast that uses
650 // the IV phi. Search for the recipe of the last cast in the chain and
651 // replace it with the original IV. Note that only the final cast is
652 // expected to have users outside the cast-chain and the dead casts left
653 // over will be cleaned up later.
654 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
655 VPValue *FindMyCast = IV;
656 for (Instruction *IRCast : reverse(Casts)) {
657 VPSingleDefRecipe *FoundUserCast = nullptr;
658 for (auto *U : FindMyCast->users()) {
659 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
660 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
661 FoundUserCast = UserCast;
662 break;
663 }
664 }
665 // A cast recipe in the chain may have been removed by earlier DCE.
666 if (!FoundUserCast)
667 break;
668 FindMyCast = FoundUserCast;
669 }
670 if (FindMyCast != IV)
671 FindMyCast->replaceAllUsesWith(IV);
672 }
673}
674
677 Instruction::BinaryOps InductionOpcode,
678 FPMathOperator *FPBinOp, Instruction *TruncI,
679 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
680 VPBuilder &Builder) {
681 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
682 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
683 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
684 VPSingleDefRecipe *BaseIV =
685 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
686
687 // Truncate base induction if needed.
688 Type *ResultTy = BaseIV->getScalarType();
689 if (TruncI) {
690 Type *TruncTy = TruncI->getType();
691 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
692 "Not truncating.");
693 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
694 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
695 ResultTy = TruncTy;
696 }
697
698 // Truncate step if needed.
699 Type *StepTy = Step->getScalarType();
700 if (ResultTy != StepTy) {
701 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
702 "Not truncating.");
703 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
704 auto *VecPreheader =
706 VPBuilder::InsertPointGuard Guard(Builder);
707 Builder.setInsertPoint(VecPreheader);
708 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
709 }
710 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
711 &Plan.getVF(), DL);
712}
713
715 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
717 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
718 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
719 if (!LoopRegion)
720 return;
721
722 auto *WideCanIV =
724 if (!WideCanIV)
725 return;
726
727 Type *CanIVTy = LoopRegion->getCanonicalIVType();
728
729 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
730 // IV.
731 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
732 VPBuilder Builder(WideCanIV);
733 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
734 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
735 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
736 WideCanIV->getDebugLoc(), Builder));
737 WideCanIV->eraseFromParent();
738 return;
739 }
740
741 if (vputils::onlyScalarValuesUsed(WideCanIV))
742 return;
743
744 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
745 // in the header, reuse it instead of introducing another wide induction phi.
746 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
747 for (VPRecipeBase &Phi : Header->phis()) {
749 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
750 continue;
751 // The reused wide IV feeds the header mask, whose lanes may extend past
752 // the trip count; drop flags that only hold inside the scalar loop.
753 WidenIV->dropPoisonGeneratingFlags();
754 WideCanIV->replaceAllUsesWith(WidenIV);
755 WideCanIV->eraseFromParent();
756 return;
757 }
758
759 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
760 auto *VecTy = VectorType::get(CanIVTy, VF);
761 InstructionCost BroadcastCost = TTI.getShuffleCost(
763 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
764 if (PHICost > BroadcastCost)
765 return;
766
767 // Bail out if the additional wide induction phi increases the expected
768 // spill cost.
769 VPRegisterUsage UnrolledBase =
770 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
771 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
772 NumUsers *= UF;
773 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
774 VPRegisterUsage Projected = UnrolledBase;
775 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
776 if (Projected.spillCost(TTI, CostKind) >
777 UnrolledBase.spillCost(TTI, CostKind))
778 return;
779
782 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
783 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
784 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
785 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
786 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
787 WideCanIV->replaceAllUsesWith(NewWideIV);
788 WideCanIV->eraseFromParent();
789}
790
791/// Returns true if \p R is dead and can be removed.
/// A recipe counts as dead if it is a conditional assume (see below), or if
/// it has no side effects and none of its defined values has a user.
792static bool isDeadRecipe(VPRecipeBase &R) {
793 // Do remove conditional assume instructions as their conditions may be
794 // flattened.
795 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// NOTE(review): the final conjunct of this condition is not visible in this
// listing; presumably it matches an llvm.assume call - confirm.
796 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
798 if (IsConditionalAssume)
799 return true;
800
// Recipes with side effects are kept even when nothing uses their results.
801 if (R.mayHaveSideEffects())
802 return false;
803
804 // Recipe is dead if no user keeps the recipe alive.
// all_of is vacuously true for a recipe that defines no values, so such a
// side-effect-free recipe is reported dead as well.
805 return all_of(R.definedValues(),
806 [](VPValue *V) { return V->getNumUsers() == 0; });
807}
808
811 Plan.getEntry());
813 // The recipes in the block are processed in reverse order, to catch chains
814 // of dead recipes.
815 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
816 if (isDeadRecipe(R)) {
817 R.eraseFromParent();
818 continue;
819 }
820
821 // Check if R is a dead VPPhi <-> update cycle and remove it.
822 VPValue *Start, *Incoming;
823 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
824 continue;
825 auto *PhiR = cast<VPPhi>(&R);
826 VPUser *PhiUser = PhiR->getSingleUser();
827 if (!PhiUser)
828 continue;
829 if (PhiUser != Incoming->getDefiningRecipe() ||
830 Incoming->getNumUsers() != 1)
831 continue;
832 PhiR->replaceAllUsesWith(Start);
833 PhiR->eraseFromParent();
834 Incoming->getDefiningRecipe()->eraseFromParent();
835 }
836 }
837}
838
841 for (unsigned I = 0; I != Users.size(); ++I) {
843 for (VPValue *V : Cur->definedValues())
844 Users.insert_range(V->users());
845 }
846 return Users.takeVector();
847}
848
849/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
850/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
851/// generates scalar values.
852static VPValue *
854 VPlan &Plan, VPBuilder &Builder) {
856 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
857 VPValue *StepV = PtrIV->getOperand(1);
859 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
860 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
861
862 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
863 PtrIV->getDebugLoc(), "next.gep");
864}
865
866/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
867/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
868/// VPWidenPointerInductionRecipe will generate vectors only. If some users
869/// require vectors while other require scalars, the scalar uses need to extract
870/// the scalars from the generated vectors (Note that this is different to how
871/// int/fp inductions are handled). Legalize extract-from-ends using uniform
872/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
873/// the correct end value is available. Also optimize
874/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
875/// providing them scalar steps built on the canonical scalar IV and update the
876/// original IV's users. This is an optional optimization to reduce the needs of
877/// vector extracts.
880 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
881 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
882 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
883 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
884 if (!PhiR)
885 continue;
886
887 // Try to narrow wide and replicating recipes to uniform recipes, based on
888 // VPlan analysis.
889 // TODO: Apply to all recipes in the future, to replace legacy uniformity
890 // analysis.
891 auto Users = collectUsersRecursively(PhiR);
892 for (VPUser *U : reverse(Users)) {
893 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
894 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
895 // Skip recipes that shouldn't be narrowed.
896 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
897 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
898 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
899 continue;
900
901 // Skip recipes that may have other lanes than their first used.
903 continue;
904
905 // TODO: Support scalarizing ExtractValue.
906 if (match(Def,
908 continue;
909
911 Def->getUnderlyingInstr()->getOpcode(), Def->operands(),
912 /*Mask=*/nullptr, *Def, {}, DebugLoc::getUnknown(),
913 Def->getUnderlyingInstr());
914 Clone->insertAfter(Def);
915 Def->replaceAllUsesWith(Clone);
916 }
917
918 // Replace wide pointer inductions which have only their scalars used by
919 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
920 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
921 if (!Plan.hasScalarVFOnly() &&
922 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
923 continue;
924
925 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
926 PtrIV->replaceAllUsesWith(PtrAdd);
927 continue;
928 }
929
930 // Replace widened induction with scalar steps for users that only use
931 // scalars.
932 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
933 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
934 return U->usesScalars(WideIV);
935 }))
936 continue;
937
938 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
940 Plan, ID.getKind(), ID.getInductionOpcode(),
941 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
942 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
943 WideIV->getDebugLoc(), Builder);
944
945 // Update scalar users of IV to use Step instead.
946 if (!HasOnlyVectorVFs) {
947 assert(!Plan.hasScalableVF() &&
948 "plans containing a scalar VF cannot also include scalable VFs");
949 WideIV->replaceAllUsesWith(Steps);
950 } else {
951 bool HasScalableVF = Plan.hasScalableVF();
952 WideIV->replaceUsesWithIf(Steps,
953 [WideIV, HasScalableVF](VPUser &U, unsigned) {
954 if (HasScalableVF)
955 return U.usesFirstLaneOnly(WideIV);
956 return U.usesScalars(WideIV);
957 });
958 }
959 }
960}
961
962/// Check if \p VPV is an untruncated wide induction, either before or after the
963/// increment. If so return the header IV (before the increment), otherwise
964/// return null.
// NOTE(review): the signature is not visible in this listing; `VPV` and `PSE`
// are used below as the value to inspect and as the handle used to obtain SCEV
// expressions, respectively -- confirm against the declaration.
967 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
968 if (WideIV) {
969 // VPV itself is a wide induction, separately compute the end value for exit
970 // users if it is not a truncated IV.
 // A wide induction that is not a VPWidenIntOrFpInductionRecipe (the dyn_cast
 // below yields null) is returned unchanged; an int/FP induction is rejected
 // only when it carries a truncate instruction.
971 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
972 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
973 }
974
975 // Check if VPV is an optimizable induction increment.
 // Only recipes with exactly two operands are considered, and one of those
 // operands (either position) must itself be a wide induction.
976 VPRecipeBase *Def = VPV->getDefiningRecipe();
977 if (!Def || Def->getNumOperands() != 2)
978 return nullptr;
979 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
980 if (!WideIV)
981 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
982 if (!WideIV)
983 return nullptr;
984
 // Returns true if VPV has the shape "WideIV <induction-op> step" for the
 // opcode recorded in WideIV's induction descriptor.
985 auto IsWideIVInc = [&]() {
986 auto &ID = WideIV->getInductionDescriptor();
987
988 // Check if VPV increments the induction by the induction step.
989 VPValue *IVStep = WideIV->getStepValue();
990 switch (ID.getInductionOpcode()) {
991 case Instruction::Add:
992 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
993 case Instruction::FAdd:
994 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
995 case Instruction::FSub:
 // Unlike the Add/FAdd cases, the operands are matched in fixed order here:
 // WideIV must be the first operand and IVStep the second.
996 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
997 m_Specific(IVStep)));
998 case Instruction::Sub: {
999 // IVStep will be the negated step of the subtraction. Check if Step == -1
1000 // * IVStep.
 // NOTE(review): the first operand of the sub is matched with a wildcard, so
 // this case does not itself verify that WideIV is the minuend; it relies on
 // the operand check performed before the lambda.
1001 VPValue *Step;
1002 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
1003 return false;
1004 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1005 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1006 ScalarEvolution &SE = *PSE.getSE();
 // Both SCEVs must be computable; the comparison is by SCEV pointer identity
 // against the negation of the subtracted operand.
1007 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1008 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1009 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1010 }
1011 default:
 // Any other opcode is accepted only for pointer inductions, where the
 // increment must be a GEP of WideIV by its step value.
1012 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1013 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1014 m_Specific(WideIV->getStepValue())));
1015 }
 // Every path through the switch above returns, so this is never reached.
1016 llvm_unreachable("should have been covered by switch above");
1017 };
 // Hand back the header induction (not the increment) when VPV is a
 // recognised increment of it.
1018 return IsWideIVInc() ? WideIV : nullptr;
1019}
1020
1021/// Attempts to optimize the induction variable exit values for users in the
1022/// early exit block.
1025 VPValue *Incoming, *Mask;
1027 m_VPValue(Incoming))))
1028 return nullptr;
1029
1030 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1031 if (!WideIV)
1032 return nullptr;
1033
1034 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1035 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1036 return nullptr;
1037
1038 // Calculate the final index.
1039 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1040 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1041 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1042 auto *ExtractR = cast<VPInstruction>(Op);
1043 VPBuilder B(ExtractR);
1044
1045 DebugLoc DL = ExtractR->getDebugLoc();
1046 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1047 FirstActiveLane = B.createScalarZExtOrTrunc(
1048 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1049 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1050
1051 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1052 // changed it means the exit is using the incremented value, so we need to
1053 // add the step.
1054 if (Incoming != WideIV) {
1055 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1056 EndValue = B.createAdd(EndValue, One, DL);
1057 }
1058
1059 if (!match(WideIV, m_CanonicalWidenIV())) {
1060 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1061 VPIRValue *Start = WideIV->getStartValue();
1062 VPValue *Step = WideIV->getStepValue();
1063 EndValue = B.createDerivedIV(
1064 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1065 Start, EndValue, Step);
1066 }
1067
1068 return EndValue;
1069}
1070
1071/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1072/// VPDerivedIVRecipe for non-canonical inductions.
1074 VPBuilder &VectorPHBuilder,
1075 VPValue *VectorTC) {
1076 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1077 // Truncated wide inductions resume from the last lane of their vector value
1078 // in the last vector iteration which is handled elsewhere.
1079 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1080 return nullptr;
1081
1082 VPIRValue *Start = WideIV->getStartValue();
1083 VPValue *Step = WideIV->getStepValue();
1085 VPValue *EndValue = VectorTC;
1086 if (!match(WideIV, m_CanonicalWidenIV())) {
1087 EndValue = VectorPHBuilder.createDerivedIV(
1088 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1089 Start, VectorTC, Step);
1090 }
1091
1092 // EndValue is derived from the vector trip count (which has the same type as
1093 // the widest induction) and thus may be wider than the induction here.
1094 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1095 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1096 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1097 ScalarTypeOfWideIV,
1098 WideIV->getDebugLoc());
1099 }
1100
1101 return EndValue;
1102}
1103
1104/// Attempts to optimize the induction variable exit values for users in the
1105/// exit block coming from the latch in the original scalar loop.
1106static VPValue *
1110 VPValue *Incoming;
1112 return nullptr;
1113
1114 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1115 if (!WideIV)
1116 return nullptr;
1117
1118 VPValue *EndValue = EndValues.lookup(WideIV);
1119 assert(EndValue && "Must have computed the end value up front");
1120
1121 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1122 // changed it means the exit is using the incremented value, so we don't
1123 // need to subtract the step.
1124 if (Incoming != WideIV)
1125 return EndValue;
1126
1127 // Otherwise, subtract the step from the EndValue.
1128 auto *ExtractR = cast<VPInstruction>(Op);
1129 VPBuilder B(ExtractR);
1130 VPValue *Step = WideIV->getStepValue();
1131 Type *ScalarTy = WideIV->getScalarType();
1132 if (ScalarTy->isIntegerTy())
1133 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1134 if (ScalarTy->isPointerTy()) {
1135 Type *StepTy = Step->getScalarType();
1136 auto *Zero = Plan.getZero(StepTy);
1137 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1138 DebugLoc::getUnknown(), "ind.escape");
1139 }
1140 if (ScalarTy->isFloatingPointTy()) {
1141 const auto &ID = WideIV->getInductionDescriptor();
1142 return B.createNaryOp(
1143 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1144 ? Instruction::FSub
1145 : Instruction::FAdd,
1146 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1147 }
1148 llvm_unreachable("all possible induction types must be handled");
1149 return nullptr;
1150}
1151
1153 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1154 // Compute end values for all inductions.
1155 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1156 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1157 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1159 VPValue *ResumeTC =
1160 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1161 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1162 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1163 if (!WideIV)
1164 continue;
1165 if (VPValue *EndValue =
1166 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1167 EndValues[WideIV] = EndValue;
1168 }
1169
1170 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1171 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1172 VPValue *Op;
1173 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1174 continue;
1175 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1176 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1177 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1178 R.eraseFromParent();
1179 }
1180 }
1181
1182 // Then, optimize exit block users.
1183 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1184 for (VPRecipeBase &R : ExitVPBB->phis()) {
1185 auto *ExitIRI = cast<VPIRPhi>(&R);
1186
1187 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1188 VPValue *Escape = nullptr;
1189 if (PredVPBB == MiddleVPBB)
1191 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1192 else
1194 Plan, ExitIRI->getOperand(Idx), PSE);
1195 if (Escape)
1196 ExitIRI->setOperand(Idx, Escape);
1197 }
1198 }
1199 }
1200}
1201
1202/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1203/// them with already existing recipes expanding the same SCEV expression.
1206
1207 for (VPRecipeBase &R :
1209 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1210 if (!ExpR)
1211 continue;
1212
1213 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1214 if (Inserted)
1215 continue;
1216
1217 ExpR->replaceAllUsesWith(V->second);
1218 if (ExpR == Plan.getTripCount())
1219 Plan.resetTripCount(V->second);
1220
1221 ExpR->eraseFromParent();
1222 }
1223}
1224
// Worklist-driven cleanup: starting from V, erase the defining recipe of each
// visited value when isDeadRecipe() reports it dead, then revisit that
// recipe's operands, since erasing it may have removed their last use.
// NOTE(review): `V` and `Seen` are declared on lines that are not visible in
// this listing; presumably V is the root value and Seen is a set of
// already-visited values -- confirm against the full source.
1226 SmallVector<VPValue *> WorkList;
1228 WorkList.push_back(V);
1229
1230 while (!WorkList.empty()) {
1231 VPValue *Cur = WorkList.pop_back_val();
 // Process each value at most once, even if it is queued several times.
1232 if (!Seen.insert(Cur).second)
1233 continue;
 // Values without a defining recipe have nothing to erase.
1234 VPRecipeBase *R = Cur->getDefiningRecipe();
1235 if (!R)
1236 continue;
1237 if (!isDeadRecipe(*R))
1238 continue;
 // Queue the operands before erasing R; R must not be touched afterwards.
1239 append_range(WorkList, R->operands());
1240 R->eraseFromParent();
1241 }
1242}
1243
1244/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1245/// Returns an optional pair, where the first element indicates whether it is
1246/// an intrinsic ID.
1247static std::optional<std::pair<bool, unsigned>>
1249 return TypeSwitch<const VPSingleDefRecipe *,
1250 std::optional<std::pair<bool, unsigned>>>(R)
1253 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1254 .Case([](const VPWidenIntrinsicRecipe *I) {
1255 return std::make_pair(true, I->getVectorIntrinsicID());
1256 })
1257 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1258 [](auto *I) {
1259 // For recipes that do not directly map to LLVM IR instructions,
1260 // assign opcodes after the last VPInstruction opcode (which is also
1261 // after the last IR Instruction opcode), based on the VPRecipeID.
1262 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1263 I->getVPRecipeID());
1264 })
1265 .Default([](auto *) { return std::nullopt; });
1266}
1267
1268/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1269/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1270/// Operands are foldable live-ins.
1272 ArrayRef<VPValue *> Operands,
1273 const DataLayout &DL) {
1274 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1275 if (!OpcodeOrIID)
1276 return nullptr;
1277
1279 for (VPValue *Op : Operands) {
1280 VPValue *Candidate = Op;
1281 match(Op, m_Broadcast(m_VPValue(Candidate)));
1282 if (!match(Candidate, m_LiveIn()))
1283 return nullptr;
1284 Value *V = Candidate->getUnderlyingValue();
1285 if (!V)
1286 return nullptr;
1287 Ops.push_back(V);
1288 }
1289
1290 VPlan &Plan = *R.getParent()->getPlan();
1291 auto FoldToIRValue = [&]() -> Value * {
1292 InstSimplifyFolder Folder(DL);
1293 if (OpcodeOrIID->first) {
1294 auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(&R);
1295 return Folder.FoldIntrinsic(OpcodeOrIID->second, Ops, R.getScalarType(),
1296 RFlags ? RFlags->getFastMathFlagsOrNone()
1297 : FastMathFlags());
1298 }
1299 unsigned Opcode = OpcodeOrIID->second;
1300 if (Instruction::isBinaryOp(Opcode))
1301 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1302 Ops[0], Ops[1]);
1303 if (Instruction::isCast(Opcode))
1304 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1305 R.getVPSingleValue()->getScalarType());
1306 switch (Opcode) {
1307 case VPInstruction::Not:
1308 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1310 case Instruction::Select:
1311 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1312 case Instruction::ICmp:
1313 case Instruction::FCmp:
1314 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1315 Ops[1]);
1316 case Instruction::GetElementPtr: {
1317 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1318 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1319 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1320 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1321 }
1324 return Folder.FoldGEP(IntegerType::getInt8Ty(Plan.getContext()), Ops[0],
1325 Ops[1],
1326 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1327 // An extract of a live-in is an extract of a broadcast, so return the
1328 // broadcasted element.
1329 case Instruction::ExtractElement:
1330 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1331 return Ops[0];
1332 }
1333 return nullptr;
1334 };
1335
1336 if (Value *V = FoldToIRValue())
1337 return Plan.getOrAddLiveIn(V);
1338 return nullptr;
1339}
1340
1341/// Try to simplify logical and bitwise recipes in \p Def.
1343 bool CanCreateNewRecipe) {
1344 VPlan *Plan = Def->getParent()->getPlan();
1345
1346 // Simplify (X && Y) | (X && !Y) -> X.
1347 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1348 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1349 // recipes to be visited during simplification.
1350 VPValue *X, *Y, *Z;
1351 if (match(Def,
1354 Def->replaceAllUsesWith(X);
1355 Def->eraseFromParent();
1356 return true;
1357 }
1358
1359 // x | AllOnes -> AllOnes
1360 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1361 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1362 return true;
1363 }
1364
1365 // x | 0 -> x
1366 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1367 Def->replaceAllUsesWith(X);
1368 return true;
1369 }
1370
1371 // x | !x -> AllOnes
1372 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1373 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1374 return true;
1375 }
1376
1377 // x & 0 -> 0
1378 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1379 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1380 return true;
1381 }
1382
1383 // x & AllOnes -> x
1384 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1385 Def->replaceAllUsesWith(X);
1386 return true;
1387 }
1388
1389 // x && false -> false
1390 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1391 Def->replaceAllUsesWith(Plan->getFalse());
1392 return true;
1393 }
1394
1395 // x && true -> x
1396 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1397 Def->replaceAllUsesWith(X);
1398 return true;
1399 }
1400
1401 // (x && y) | (x && z) -> x && (y | z)
1402 if (CanCreateNewRecipe &&
1405 // Simplify only if one of the operands has one use to avoid creating an
1406 // extra recipe.
1407 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1408 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1409 Def->replaceAllUsesWith(
1410 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1411 return true;
1412 }
1413
1414 // x && (x && y) -> x && y
1415 if (match(Def, m_LogicalAnd(m_VPValue(X),
1417 Def->replaceAllUsesWith(Def->getOperand(1));
1418 return true;
1419 }
1420
1421 // x && (y && x) -> x && y
1422 if (match(Def, m_LogicalAnd(m_VPValue(X),
1424 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1425 return true;
1426 }
1427
1428 // x && !x -> 0
1429 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1430 Def->replaceAllUsesWith(Plan->getFalse());
1431 return true;
1432 }
1433
1434 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1435 Def->replaceAllUsesWith(X);
1436 return true;
1437 }
1438
1439 // select c, false, true -> not c
1440 VPValue *C;
1441 if (CanCreateNewRecipe &&
1442 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1443 Def->replaceAllUsesWith(Builder.createNot(C));
1444 return true;
1445 }
1446
1447 // select !c, x, y -> select c, y, x
1448 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1449 Def->setOperand(0, C);
1450 Def->setOperand(1, Y);
1451 Def->setOperand(2, X);
1452 return true;
1453 }
1454
1455 // select x, (i1 y | z), y -> y | (x && z)
1456 if (CanCreateNewRecipe &&
1457 match(Def, m_Select(m_VPValue(X),
1459 m_Deferred(Y))) &&
1460 Y->getScalarType()->isIntegerTy(1)) {
1461 Def->replaceAllUsesWith(
1462 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1463 return true;
1464 }
1465
1466 return false;
1467}
1468
1469/// Try to simplify VPSingleDefRecipe \p Def.
1471 VPlan *Plan = Def->getParent()->getPlan();
1472
1473 // Simplification of live-in IR values for SingleDef recipes using
1474 // InstSimplifyFolder.
1475 const DataLayout &DL = Plan->getDataLayout();
1476 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL))
1477 return Def->replaceAllUsesWith(V);
1478
1479 // Fold PredPHI LiveIn -> LiveIn.
1480 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1481 VPValue *Op = PredPHI->getOperand(0);
1482 if (isa<VPIRValue>(Op))
1483 PredPHI->replaceAllUsesWith(Op);
1484 }
1485
1486 // Drop the mask of a predicated store masked by the header mask (which is
1487 // guaranteed to be true at least for the first lane) and both the stored
1488 // value and the address are uniform across VF and UF.
1489 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
1490 RepR && RepR->isPredicated() && RepR->getOpcode() == Instruction::Store &&
1491 all_of(RepR->operandsWithoutMask(), vputils::isUniformAcrossVFsAndUFs) &&
1492 vputils::isHeaderMask(RepR->getMask(), *Plan)) {
1493 auto *Unmasked = new VPReplicateRecipe(
1494 RepR->getUnderlyingInstr(), RepR->operandsWithoutMask(),
1495 RepR->isSingleScalar(), /*Mask=*/nullptr, *RepR, *RepR,
1496 RepR->getDebugLoc());
1497 Unmasked->insertBefore(RepR);
1498 RepR->replaceAllUsesWith(Unmasked);
1499 RepR->eraseFromParent();
1500 return;
1501 }
1502
1503 VPBuilder Builder(Def);
1504
1505 // Avoid replacing VPInstructions with underlying values with new
1506 // VPInstructions, as we would fail to create widen/replicate recipes from the
1507 // new VPInstructions without an underlying value, and miss out on some
1508 // transformations that only apply to widened/replicated recipes later, by
1509 // doing so.
1510 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1511 // VPInstructions without underlying values, as those will get skipped during
1512 // cost computation.
1513 bool CanCreateNewRecipe =
1514 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1515
1516 VPValue *A;
1517 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1518 Type *TruncTy = Def->getScalarType();
1519 Type *ATy = A->getScalarType();
1520 if (TruncTy == ATy) {
1521 Def->replaceAllUsesWith(A);
1522 } else {
1523 // Don't replace a non-widened cast recipe with a widened cast.
1524 if (!isa<VPWidenCastRecipe>(Def))
1525 return;
1526 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1527
1528 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1529 ? Instruction::SExt
1530 : Instruction::ZExt;
1531 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1532 TruncTy);
1533 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1534 // UnderlyingExt has distinct return type, used to retain legacy cost.
1535 Ext->setUnderlyingValue(UnderlyingExt);
1536 }
1537 Def->replaceAllUsesWith(Ext);
1538 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1539 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1540 Def->replaceAllUsesWith(Trunc);
1541 }
1542 }
1543 }
1544
1545 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1546 return;
1547
1548 VPValue *X, *Y, *C;
1549 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1550 return Def->replaceAllUsesWith(A);
1551
1552 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1553 return Def->replaceAllUsesWith(A);
1554
1555 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1556 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1557
1558 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1559 // Preserve nsw from the Mul on the new Sub.
1561 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1562 return Def->replaceAllUsesWith(Builder.createSub(
1563 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1564 }
1565
1566 if (CanCreateNewRecipe &&
1568 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1569 // new Sub.
1571 false,
1572 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1573 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1574 ->hasNoSignedWrap()};
1575 return Def->replaceAllUsesWith(
1576 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1577 }
1578
1579 const APInt *APC;
1580 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1581 APC->isPowerOf2())
1582 return Def->replaceAllUsesWith(Builder.createNaryOp(
1583 Instruction::Shl,
1584 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1585 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1586
1587 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1588 APC->isPowerOf2())
1589 return Def->replaceAllUsesWith(Builder.createNaryOp(
1590 Instruction::LShr,
1591 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1592 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1593
1594 if (match(Def, m_Not(m_VPValue(A)))) {
1595 if (match(A, m_Not(m_VPValue(A))))
1596 return Def->replaceAllUsesWith(A);
1597
1598 // Try to fold Not into compares by adjusting the predicate in-place.
1599 CmpPredicate Pred;
1600 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1601 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1602 if (all_of(Cmp->users(),
1604 m_Not(m_Specific(Cmp)),
1605 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1606 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1607 for (VPUser *U : to_vector(Cmp->users())) {
1608 auto *R = cast<VPSingleDefRecipe>(U);
1609 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1610 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1611 R->setOperand(1, Y);
1612 R->setOperand(2, X);
1613 } else {
1614 // not (cmp pred) -> cmp inv_pred
1615 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1616 R->replaceAllUsesWith(Cmp);
1617 }
1618 }
1619 // If Cmp doesn't have a debug location, use the one from the negation,
1620 // to preserve the location.
1621 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1622 Cmp->setDebugLoc(Def->getDebugLoc());
1623 }
1624 }
1625 }
1626
1627 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1628 // any-of (fcmp uno %A, %B), ...
1629 if (match(Def, m_AnyOf())) {
1631 VPRecipeBase *UnpairedCmp = nullptr;
1632 for (VPValue *Op : Def->operands()) {
1633 VPValue *X;
1634 if (Op->getNumUsers() > 1 ||
1636 m_Deferred(X)))) {
1637 NewOps.push_back(Op);
1638 } else if (!UnpairedCmp) {
1639 UnpairedCmp = Op->getDefiningRecipe();
1640 } else {
1641 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1642 UnpairedCmp->getOperand(0), X));
1643 UnpairedCmp = nullptr;
1644 }
1645 }
1646
1647 if (UnpairedCmp)
1648 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1649
1650 if (NewOps.size() < Def->getNumOperands()) {
1651 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1652 return Def->replaceAllUsesWith(NewAnyOf);
1653 }
1654 }
1655
1656 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1657 // This is useful for fmax/fmin without fast-math flags, where we need to
1658 // check if any operand is NaN.
1659 if (CanCreateNewRecipe &&
1661 m_Deferred(X)),
1663 m_Deferred(Y))))) {
1664 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1665 return Def->replaceAllUsesWith(NewCmp);
1666 }
1667
1668 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1669 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1670 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1671 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1672 return Def->replaceAllUsesWith(Def->getOperand(1));
1673
1675 m_One()))) {
1676 Type *WideStepTy = Def->getScalarType();
1677 if (X->getScalarType() != WideStepTy)
1678 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1679 Def->replaceAllUsesWith(X);
1680 return;
1681 }
1682
1683 // For i1 vp.merges produced by AnyOf reductions:
1684 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1686 m_VPValue(X), m_VPValue())) &&
1688 Def->getScalarType()->isIntegerTy(1)) {
1689 Def->setOperand(1, Def->getOperand(0));
1690 Def->setOperand(0, Y);
1691 return;
1692 }
1693
1694 // Simplify MaskedCond with no block mask to its single operand.
1696 !cast<VPInstruction>(Def)->isMasked())
1697 return Def->replaceAllUsesWith(Def->getOperand(0));
1698
1699 // Look through ExtractLastLane.
1700 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1701 if (match(A, m_BuildVector())) {
1702 auto *BuildVector = cast<VPInstruction>(A);
1703 Def->replaceAllUsesWith(
1704 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1705 return;
1706 }
1707
1708 if (match(A, m_Broadcast(m_VPValue(X))))
1709 return Def->replaceAllUsesWith(X);
1710
1712 return Def->replaceAllUsesWith(A);
1713
1714 if (Plan->hasScalarVFOnly())
1715 return Def->replaceAllUsesWith(A);
1716 }
1717
1718 // Look through ExtractPenultimateElement (BuildVector ....).
1720 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1721 Def->replaceAllUsesWith(
1722 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1723 return;
1724 }
1725
1726 uint64_t Idx;
1728 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1729 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1730 return;
1731 }
1732
1733 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1734 Def->replaceAllUsesWith(
1735 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1736 return;
1737 }
1738
1739 // Look through broadcast of single-scalar when used as select conditions; in
1740 // that case the scalar condition can be used directly.
1741 if (match(Def,
1744 "broadcast operand must be single-scalar");
1745 Def->setOperand(0, C);
1746 return;
1747 }
1748
1749 if (match(Def, m_Broadcast(m_VPValue(X))))
1750 return Def->replaceUsesWithIf(
1751 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1752
1754 if (Def->getNumOperands() == 1) {
1755 Def->replaceAllUsesWith(Def->getOperand(0));
1756 return;
1757 }
1758 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1759 if (all_equal(Phi->incoming_values()))
1760 Phi->replaceAllUsesWith(Phi->getOperand(0));
1761 }
1762 return;
1763 }
1764
1765 VPIRValue *IRV;
1766 if (Def->getNumOperands() == 1 &&
1768 return Def->replaceAllUsesWith(IRV);
1769
1770 // Some simplifications can only be applied after unrolling. Perform them
1771 // below.
1772 if (!Plan->isUnrolled())
1773 return;
1774
1775 // After unrolling, extract-lane may be used to extract values from multiple
1776 // scalar sources. Only simplify when extracting from a single scalar source.
1777 VPValue *LaneToExtract;
1778 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1779 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1781 return Def->replaceAllUsesWith(A);
1782
1783 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1784 // scalar canonical IV.
1786 if (match(LaneToExtract, m_ZeroInt()) &&
1787 match(A, m_CanonicalWidenIV(WidenIV)))
1788 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1789
1790 // Simplify extract-lane with single source to extract-element.
1791 Def->replaceAllUsesWith(Builder.createNaryOp(
1792 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1793 return;
1794 }
1795
1796 // Look for cycles where Def is of the form:
1797 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1798 // IVInc = X + Step ; used by X and Def
1799 // Def = IVInc + Y
1800 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1801 // and if Inc exists, replace it with X.
1802 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1803 isa<VPIRValue>(Y) &&
1804 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1805 auto *Phi = cast<VPPhi>(X);
1806 auto *IVInc = Def->getOperand(0);
1807 if (IVInc->getNumUsers() == 2) {
1808 // If Phi has a second user (besides IVInc's defining recipe), it must
1809 // be Inc = Phi + Y for the fold to apply.
1811 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1812 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1813 Def->replaceAllUsesWith(IVInc);
1814 if (Inc)
1815 Inc->replaceAllUsesWith(Phi);
1816 Phi->setOperand(0, Y);
1817 return;
1818 }
1819 }
1820 }
1821
1822 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1823 // just the pointer operand.
1824 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1825 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1826 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1827
1828 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1829 // the start index is zero and only the first lane is demanded.
1830 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1831 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1832 Steps->replaceAllUsesWith(Steps->getOperand(0));
1833 return;
1834 }
1835 }
1836 // Simplify redundant ReductionStartVector recipes after unrolling.
1837 VPValue *StartV;
1839 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1840 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1841 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1842 return PhiR && PhiR->isInLoop();
1843 });
1844 return;
1845 }
1846
1847 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1848 return Def->replaceAllUsesWith(A);
1849}
1850
1860
1862 VPValue *X;
1865 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1866 if (match(&R, m_Reverse(m_Reverse(m_VPValue(X)))))
1867 R.getVPSingleValue()->replaceAllUsesWith(X);
1868}
1869
1870/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1871/// header mask to be simplified further when tail folding, e.g. in
1872/// optimizeEVLMasks.
/// Does nothing when no header mask is found in \p Plan.
1873static void reassociateHeaderMask(VPlan &Plan) {
1874 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1875 if (!HeaderMask)
1876 return;
1877
     // Seed the worklist from users of the form (HeaderMask && x).
     // NOTE(review): the statement guarded by the `if` below is not visible in
     // this listing (presumably it adds U to Worklist) -- confirm against the
     // upstream file.
1878 SmallVector<VPUser *> Worklist;
1879 for (VPUser *U : HeaderMask->users())
1880 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1882
1883 while (!Worklist.empty()) {
1884 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1885 VPValue *X, *Y;
     // Only single-def recipes computing (HeaderMask && X) && Y are rewritten;
     // everything else popped from the worklist is ignored.
1886 if (!R || !match(R, m_LogicalAnd(
1887 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1888 m_VPValue(Y))))
1889 continue;
     // Queue R's current users before they are redirected to the new value, so
     // they are examined for the same pattern in a later iteration.
1890 append_range(Worklist, R->users());
     // Build HeaderMask && (X && Y) with a builder positioned at R and use it
     // in place of R.
1891 VPBuilder Builder(R);
1892 R->replaceAllUsesWith(
1893 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1894 }
1895}
1896
/// Translate a masked integer division/remainder intrinsic ID into the
/// matching binary opcode (UDiv, SDiv, URem or SRem). Any other ID yields an
/// empty optional.
/// NOTE(review): the line carrying the function name and parameter list is not
/// visible in this listing; a caller in this file invokes it as
/// getUnmaskedDivRemOpcode(<intrinsic ID>) -- confirm upstream.
1897static std::optional<Instruction::BinaryOps>
1899 switch (ID) {
1900 case Intrinsic::masked_udiv:
1901 return Instruction::UDiv;
1902 case Intrinsic::masked_sdiv:
1903 return Instruction::SDiv;
1904 case Intrinsic::masked_urem:
1905 return Instruction::URem;
1906 case Intrinsic::masked_srem:
1907 return Instruction::SRem;
1908 default:
     // Not one of the four masked div/rem intrinsics: no opcode.
1909 return {};
1910 }
1911}
1912
// NOTE(review): the line with this function's name and parameters is not
// visible in this listing (a comment in licm mentions
// narrowToSingleScalarRecipes, presumably this transform) -- confirm upstream.
// Nothing to narrow if the plan only has a scalar VF.
1914 if (Plan.hasScalarVFOnly())
1915 return;
1916
     // NOTE(review): the start of the block-iteration loop header is not visible
     // here; only its trailing range argument remains.
1918 vp_depth_first_deep(Plan.getEntry()))) {
     // Visit the recipes of the block in reverse order; the early-inc range
     // allows erasing the current recipe while iterating.
1919 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
     // NOTE(review): the condition guarding this `continue` is not visible in
     // this listing.
1922 continue;
     // Replicate recipes that are already single-scalar, or that are
     // predicated, are left alone.
1923 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1924 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1925 continue;
1926
1927 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
     // A replicated store whose pointer operand (operand 1) is single-scalar is
     // replaced by one single-scalar store of the last lane of the stored
     // value (operand 0).
1928 if (RepR && RepR->getOpcode() == Instruction::Store &&
1929 vputils::isSingleScalar(RepR->getOperand(1))) {
1930 auto *Clone = new VPReplicateRecipe(
1931 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1932 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1933 *RepR /*Metadata*/, RepR->getDebugLoc());
1934 Clone->insertBefore(RepOrWidenR);
1935 VPBuilder Builder(Clone);
1936 VPValue *ExtractOp = Clone->getOperand(0);
     // If the pointer is uniform across VFs and UFs, first select the last
     // part, then (in all cases) the last lane of the stored value.
1937 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1938 ExtractOp =
1939 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1940 ExtractOp =
1941 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1942 Clone->setOperand(0, ExtractOp);
1943 RepR->eraseFromParent();
1944 continue;
1945 }
1946
1947 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1948 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
     // Only applies when just the first lane of the result is used.
1949 if (!vputils::onlyFirstLaneUsed(IntrR))
1950 continue;
1951 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1952 if (!Opc)
1953 continue;
1954 VPBuilder Builder(IntrR);
     // Safe divisor = select(operand 2, operand 1, 1); operand 2 is presumably
     // the intrinsic's mask and operand 1 its divisor -- confirm against the
     // intrinsic definition.
1955 VPValue *SafeDivisor = Builder.createSelect(
1956 IntrR->getOperand(2), IntrR->getOperand(1),
1957 Plan.getConstantInt(IntrR->getScalarType(), 1));
     // Emit the plain div/rem with default flags for the opcode and replace the
     // intrinsic recipe with it.
1958 VPValue *Clone = Builder.createNaryOp(
1959 *Opc, {IntrR->getOperand(0), SafeDivisor},
1960 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1961 IntrR->replaceAllUsesWith(Clone);
1962 IntrR->eraseFromParent();
1963 continue;
1964 }
1965
1966 // Skip recipes that aren't single scalars.
1967 if (!vputils::isSingleScalar(RepOrWidenR))
1968 continue;
1969
1970 // Predicate to check if a user of Op introduces extra broadcasts.
1971 auto IntroducesBCastOf = [](const VPValue *Op) {
1972 return [Op](const VPUser *U) {
1973 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
     // NOTE(review): the start of the opcode check is not visible in this
     // listing; certain VPInstruction opcodes are treated as not introducing
     // a broadcast.
1977 VPI->getOpcode()))
1978 return false;
1979 }
     // A user that does not use Op as scalars counts as introducing a broadcast.
1980 return !U->usesScalars(Op);
1981 };
1982 };
1983
     // Do not narrow if some user of the recipe would broadcast it while none
     // of its operands (a) has another user broadcasting that operand, (b) is
     // a non-constant live-in, or (c) is a single-scalar replicate recipe.
1984 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1985 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1986 if (any_of(
1987 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1988 IntroducesBCastOf(Op)))
1989 return false;
1990 // Non-constant live-ins require broadcasts, while constants do not
1991 // need explicit broadcasts.
1992 auto *IRV = dyn_cast<VPIRValue>(Op);
1993 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1994 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1995 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1996 }))
1997 continue;
1998
     // Replace the recipe with a single-scalar clone placed right before it;
     // the original is erased only if it became dead.
1999 auto *Clone = VPBuilder::createSingleScalarOp(
2000 getOpcodeOrIntrinsicID(RepOrWidenR)->second, RepOrWidenR->operands(),
2001 /*Mask=*/nullptr, *RepOrWidenR, {}, DebugLoc::getUnknown(),
2002 RepOrWidenR->getUnderlyingInstr());
2003 Clone->insertBefore(RepOrWidenR);
2004 RepOrWidenR->replaceAllUsesWith(Clone);
2005 if (isDeadRecipe(*RepOrWidenR))
2006 RepOrWidenR->eraseFromParent();
2007 }
2008 }
2009}
2010
2011/// Try to see if all of \p Blend's masks share a common value logically and'ed
2012/// and remove it from the masks.
/// Normalized blends are left untouched.
/// NOTE(review): the signature line is not visible in this listing; the caller
/// in simplifyBlends invokes it as removeCommonBlendMask(Blend).
2014 if (Blend->isNormalized())
2015 return;
     // Take the first operand of mask 0's logical-and as the candidate common
     // mask; give up if mask 0 is not a logical-and.
2016 VPValue *CommonEdgeMask;
2017 if (!match(Blend->getMask(0),
2018 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
2019 return;
     // Every mask must be of the form (CommonEdgeMask && x) for the rewrite to
     // apply.
2020 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2021 if (!match(Blend->getMask(I),
2022 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
2023 return;
     // Replace each mask by operand 1 of its defining recipe, i.e. the value
     // that was and'ed with the common mask.
2024 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2025 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
2026}
2027
2028/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
2029/// to make sure the masks are simplified.
/// For each blend: strip a mask shared by all incoming values, replace the
/// blend by its only live incoming value if there is just one, and otherwise
/// rebuild a non-normalized blend in normalized form.
2030static void simplifyBlends(VPlan &Plan) {
     // NOTE(review): the header of the loop over basic blocks (defining VPBB) is
     // not visible in this listing.
2033 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2034 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2035 if (!Blend)
2036 continue;
2037
2038 removeCommonBlendMask(Blend);
2039
2040 // Try to remove redundant blend recipes.
     // Collect the distinct incoming values whose mask is not known to be
     // false. Incoming value 0 of a normalized blend is always collected.
2041 SmallPtrSet<VPValue *, 4> UniqueValues;
2042 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2043 UniqueValues.insert(Blend->getIncomingValue(0));
2044 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2045 if (!match(Blend->getMask(I), m_False()))
2046 UniqueValues.insert(Blend->getIncomingValue(I));
2047
     // A single remaining value makes the blend redundant.
2048 if (UniqueValues.size() == 1) {
2049 Blend->replaceAllUsesWith(*UniqueValues.begin());
2050 Blend->eraseFromParent();
2051 continue;
2052 }
2053
2054 if (Blend->isNormalized())
2055 continue;
2056
2057 // Normalize the blend so its first incoming value is used as the initial
2058 // value with the others blended into it.
2059
2060 unsigned StartIndex = 0;
2061 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2062 // If a value's mask is used only by the blend then it can be dead-coded.
2063 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2064 // that's used by multiple blends where it can be removed from them all.
2065 VPValue *Mask = Blend->getMask(I);
2066 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
2067 StartIndex = I;
2068 break;
2069 }
2070 }
2071
     // New operand list: the start value first (without a mask), followed by
     // (incoming value, mask) pairs for all other incoming values.
2072 SmallVector<VPValue *, 4> OperandsWithMask;
2073 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2074
2075 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2076 if (I == StartIndex)
2077 continue;
2078 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2079 OperandsWithMask.push_back(Blend->getMask(I));
2080 }
2081
2082 auto *NewBlend =
2083 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2084 OperandsWithMask, *Blend, Blend->getDebugLoc());
2085 NewBlend->insertBefore(&R);
2086
     // Remember the mask that the normalized blend no longer uses before the
     // old blend is erased.
     // NOTE(review): the statement consuming DeadMask is not visible in this
     // listing (presumably it deletes the mask if it is now dead) -- confirm
     // upstream.
2087 VPValue *DeadMask = Blend->getMask(StartIndex);
2088 Blend->replaceAllUsesWith(NewBlend);
2089 Blend->eraseFromParent();
2091
2092 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2093 VPValue *NewMask;
2094 if (NewBlend->getNumOperands() == 3 &&
2095 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2096 VPValue *Inc0 = NewBlend->getOperand(0);
2097 VPValue *Inc1 = NewBlend->getOperand(1);
2098 VPValue *OldMask = NewBlend->getOperand(2);
2099 NewBlend->setOperand(0, Inc1);
2100 NewBlend->setOperand(1, Inc0);
2101 NewBlend->setOperand(2, NewMask);
     // Erase the Not if swapping the operands left it without users.
2102 if (OldMask->getNumUsers() == 0)
2103 cast<VPInstruction>(OldMask)->eraseFromParent();
2104 }
2105 }
2106 }
2107}
2108
2109/// Optimize the width of vector induction variables in \p Plan based on a known
2110/// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if at least one induction was narrowed. Only fixed \p BestVF
/// and constant trip counts are handled.
/// NOTE(review): the first line of the signature is not visible in this
/// listing; a caller in this file invokes it as
/// optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF).
2112 ElementCount BestVF,
2113 unsigned BestUF) {
2114 // Only proceed if we have not completely removed the vector region.
2115 if (!Plan.getVectorLoopRegion())
2116 return false;
2117
2118 const APInt *TC;
2119 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2120 return false;
2121
2122 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2123 // and UF. Returns at least 8.
2124 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
     // NOTE(review): the initializer of AlignedTC is not visible in this
     // listing (presumably TC rounded up to a multiple of Align) -- confirm
     // upstream.
2125 APInt AlignedTC =
     // The largest value that has to be representable is AlignedTC - 1.
2128 APInt MaxVal = AlignedTC - 1;
2129 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2130 };
2131 unsigned NewBitWidth =
2132 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2133
2134 LLVMContext &Ctx = Plan.getContext();
2135 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2136
2137 bool MadeChange = false;
2138
2139 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2140 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2141 // Currently only handle canonical IVs as it is trivial to replace the start
2142 // and stop values, and we currently only perform the optimization when the
2143 // IV has a single use.
     // NOTE(review): the declaration of WideIV is not visible in this listing.
2145 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2146 continue;
     // Skip IVs with several distinct users, and IVs that already have the
     // target type.
2147 if (WideIV->hasMoreThanOneUniqueUser() ||
2148 NewIVTy == WideIV->getScalarType())
2149 continue;
2150
2151 // Currently only handle cases where the single user is a header-mask
2152 // comparison with the backedge-taken-count.
2153 VPUser *SingleUser = WideIV->getSingleUser();
     // NOTE(review): the second operand of the ICmp pattern is not visible in
     // this listing.
2154 if (!SingleUser ||
2155 !match(SingleUser,
2156 m_ICmp(m_Specific(WideIV),
2158 continue;
2159
2160 // Update IV operands and comparison bound to use new narrower type.
2161 assert(!WideIV->getTruncInst() &&
2162 "canonical IV is not expected to have a truncation");
     // New induction in the narrow type, starting at zero with step one, placed
     // before the original one.
2163 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2164 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2165 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2166 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2167 NewWideIV->insertBefore(WideIV);
2168
     // Truncate the backedge-taken count to the narrow type in the vector
     // preheader.
2169 auto *NewBTC = new VPWidenCastRecipe(
2170 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2171 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2172 Plan.getVectorPreheader()->appendRecipe(NewBTC);
     // Re-create the comparison with the same predicate on the narrow operands
     // and redirect all users of the old comparison to it.
2173 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2174 Cmp->replaceAllUsesWith(
2175 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2176
2177 MadeChange = true;
2178 }
2179
2180 return MadeChange;
2181}
2182
2183/// Return true if \p Cond is known to be true for given \p BestVF and \p
2184/// BestUF.
/// NOTE(review): the first line of the signature and the remaining parameters
/// are not visible in this listing; callers in this file invoke it as
/// isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE).
2186 ElementCount BestVF, unsigned BestUF,
     // NOTE(review): the condition guarding this early return is not visible in
     // this listing. When taken, the result is true if any operand of Cond's
     // defining recipe is itself known to be true (recursive check).
2189 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2190 &PSE](VPValue *C) {
2191 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2192 });
2193
2194 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
     // NOTE(review): the start of the pattern match is not visible in this
     // listing; the visible part requires (CanIV + VFxUF) compared against the
     // vector trip count, otherwise the result is false.
2197 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2198 m_Specific(&Plan.getVectorTripCount()))))
2199 return false;
2200
2201 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2202 // count is not conveniently available as SCEV so far, so we compare directly
2203 // against the original trip count. This is stricter than necessary, as we
2204 // will only return true if the trip count == vector trip count.
     // NOTE(review): the initializer of VectorTripCount is not visible in this
     // listing; if it is not computable, the SCEV of the original trip count is
     // used instead.
2205 const SCEV *VectorTripCount =
2207 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2208 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2209 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2210 "Trip count SCEV must be computable");
     // The condition is known true only if SCEV can prove that the trip count
     // equals BestVF * BestUF.
2211 ScalarEvolution &SE = *PSE.getSE();
2212 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2213 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2214 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2215}
2216
2217/// Try to replace multiple active lane masks used for control flow with
2218/// a single, wide active lane mask instruction followed by multiple
2219/// extract subvector intrinsics. This applies to the active lane mask
2220/// instructions both in the loop and in the preheader.
2221/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2222/// new extracts from the first active lane mask, which has its last
2223/// operand (multiplier) set to UF.
/// Returns true if the plan was changed. Nothing is done unless
/// EnableWideActiveLaneMask is set, \p VF is a vector VF and \p UF is not 1.
/// NOTE(review): the first line of the signature is not visible in this
/// listing; a caller in this file invokes it as
/// tryToReplaceALMWithWideALM(Plan, BestVF, BestUF).
2225 unsigned UF) {
2226 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2227 return false;
2228
2229 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2230 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2231 auto *Term = &ExitingVPBB->back();
2232
2233 using namespace llvm::VPlanPatternMatch;
     // NOTE(review): the start of the terminator pattern match is not visible in
     // this listing; the transform bails out if the terminator does not match.
2235 m_VPValue(), m_VPValue(), m_VPValue())))))
2236 return false;
2237
2238 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2239 LLVMContext &Ctx = Plan.getContext();
2240
     // Create one llvm.vector.extract recipe per unroll part from ALM, starting
     // at element VF.getKnownMinValue() * Part, insert each right after ALM and
     // record it in Extracts[Part].
2241 auto ExtractFromALM = [&](VPInstruction *ALM,
2242 SmallVectorImpl<VPValue *> &Extracts) {
2243 DebugLoc DL = ALM->getDebugLoc();
2244 for (unsigned Part = 0; Part < UF; ++Part) {
     // NOTE(review): the declaration of Ops is not visible in this listing.
2246 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2247 auto *Ext =
2248 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2249 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2250 Extracts[Part] = Ext;
2251 Ext->insertAfter(ALM);
2252 }
2253 };
2254
2255 // Create a list of each active lane mask phi, ordered by unroll part.
     // NOTE(review): the declaration of Phis and the cast that defines Phi are
     // not visible in this listing.
2257 for (VPRecipeBase &R : Header->phis()) {
2259 if (!Phi)
2260 continue;
2261 VPValue *Index = nullptr;
     // NOTE(review): the pattern that binds Index from the backedge value is
     // not visible in this listing.
2262 match(Phi->getBackedgeValue(),
2264 assert(Index && "Expected index from ActiveLaneMask instruction");
2265
     // NOTE(review): the start of the pattern extracting Part from Index is not
     // visible in this listing; Part is the constant factor of a multiply.
2266 uint64_t Part;
2267 if (match(Index,
2269 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2270 Phis[Part] = Phi;
2271 else {
2272 // Anything other than a CanonicalIVIncrementForPart is part 0
2273 assert(!match(
2274 Index,
2276 Phis[0] = Phi;
2277 }
2278 }
2279
2280 assert(all_of(Phis, not_equal_to(nullptr)) &&
2281 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2282
     // The active lane masks feeding the part-0 phi are the ones that get
     // widened.
2283 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2284 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2285
2286 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2287 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2288 "Expected incoming values of Phi to be ActiveLaneMasks");
2289
2290 // When using wide lane masks, the return type of the get.active.lane.mask
2291 // intrinsic is VF x UF (last operand).
2292 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2293 EntryALM->setOperand(2, ALMMultiplier);
2294 LoopALM->setOperand(2, ALMMultiplier);
2295
2296 // Create UF x extract vectors and insert into preheader.
2297 SmallVector<VPValue *> EntryExtracts(UF);
2298 ExtractFromALM(EntryALM, EntryExtracts);
2299
2300 // Create UF x extract vectors and insert before the loop compare & branch,
2301 // updating the compare to use the first extract.
2302 SmallVector<VPValue *> LoopExtracts(UF);
2303 ExtractFromALM(LoopALM, LoopExtracts);
2304 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2305 Not->setOperand(0, LoopExtracts[0]);
2306
2307 // Update the incoming values of active lane mask phis.
2308 for (unsigned Part = 0; Part < UF; ++Part) {
2309 Phis[Part]->setStartValue(EntryExtracts[Part]);
2310 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2311 }
2312
2313 return true;
2314}
2315
2316/// Try to simplify the branch condition of \p Plan. This may restrict the
2317/// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the terminator of the exiting block was changed so that the
/// vector loop exits in its first iteration.
/// NOTE(review): the first line of the signature and the last parameter are
/// not visible in this listing; a caller in this file invokes it as
/// simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE).
2319 unsigned BestUF,
2321 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2322 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2323 auto *Term = &ExitingVPBB->back();
2324 VPValue *Cond;
2325 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2326 // Check if the branch condition compares the canonical IV increment (for main
2327 // loop), or the canonical IV increment plus an offset (for epilog loop).
     // NOTE(review): the start of the second alternative of this condition is
     // not visible in this listing; per the comment below it covers
     // BranchOnCond(Not(ActiveLaneMask)).
2328 if (match(Term, m_BranchOnCount(
2329 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2330 m_VPValue())) ||
2332 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2333 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2334 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
     // NOTE(review): the initializer of VectorTripCount and its fallback
     // expression are not visible in this listing.
2335 const SCEV *VectorTripCount =
2337 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2338 VectorTripCount =
2340 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2341 "Trip count SCEV must be computable");
2342 ScalarEvolution &SE = *PSE.getSE();
2343 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2344 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
     // Give up unless SCEV proves trip count <= BestVF * BestUF (unsigned).
2345 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2346 return false;
     // NOTE(review): the second alternative of the condition below is not
     // visible in this listing.
2347 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2349 // For BranchOnCond, check if we can prove the condition to be true using VF
2350 // and UF.
2351 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2352 return false;
2353 } else {
     // Any other kind of terminator is not handled.
2354 return false;
2355 }
2356
2357 // The vector loop region only executes once. Convert terminator of the
2358 // exiting block to exit in the first iteration.
     // For a two-condition branch only operand 1 is replaced by true; the
     // terminator itself is kept.
2359 if (match(Term, m_BranchOnTwoConds())) {
2360 Term->setOperand(1, Plan.getTrue());
2361 return true;
2362 }
2363
     // Otherwise replace the terminator by an unconditional BranchOnCond(true)
     // carrying the old terminator's debug location.
2364 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2365 {}, Term->getDebugLoc());
2366 ExitingVPBB->appendRecipe(BOC);
2367 Term->eraseFromParent();
2368
2369 return true;
2370}
2371
2372/// From the definition of llvm.experimental.get.vector.length,
2373/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Replaces the first such EVL recipe found by (a cast of) its AVL operand and
/// returns true; returns false if no EVL recipe could be simplified.
/// NOTE(review): the signature and the header of the block loop are not
/// visible in this listing; the body uses Plan, VF and PSE.
2377 vp_depth_first_deep(Plan.getEntry()))) {
2378 for (VPRecipeBase &R : *VPBB) {
2379 VPValue *AVL;
2380 if (!match(&R, m_EVL(m_VPValue(AVL))))
2381 continue;
2382
     // The AVL must have a computable SCEV that is provably <= VF (unsigned).
2383 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2384 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2385 continue;
2386 ScalarEvolution &SE = *PSE.getSE();
2387 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2388 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2389 continue;
2390
     // NOTE(review): the start of the statement defining Trunc is not visible
     // in this listing; from its arguments it presumably converts AVL from its
     // SCEV type to i32 -- confirm upstream.
2392 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2393 R.getDebugLoc());
     // If a new recipe was created for the conversion, try to fold it when its
     // operands are live-ins.
2394 if (Trunc != AVL) {
2395 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2396 const DataLayout &DL = Plan.getDataLayout();
2397 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL))
2398 Trunc = Folded;
2399 }
     // Stop after the first replacement.
2400 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2401 return true;
2402 }
2403 }
2404 return false;
2405}
2406
// NOTE(review): the first line of the signature and the last parameter are not
// visible in this listing; the body uses Plan, BestVF, BestUF and PSE.
// Runs the VF/UF-specific simplifications below; if any of them changed the
// plan, the plan is restricted to BestVF.
2408 unsigned BestUF,
2410 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2411 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2412
     // `|=` is used so that every transform runs even after an earlier one
     // already reported a change.
2413 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2414 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2415 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2416
2417 if (MadeChange) {
2418 Plan.setVF(BestVF);
2419 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2420 }
2421}
2422
// NOTE(review): the signature of this function and the range expression of
// the loop below are not visible in this listing.
// For each reduction phi of a selected recurrence kind, drops the
// poison-generating flags of every recursive user that carries IR flags.
2424 for (VPRecipeBase &R :
2426 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2427 if (!PhiR)
2428 continue;
2429 RecurKind RK = PhiR->getRecurrenceKind();
     // Only Add, Mul and Sub recurrences are processed here.
     // NOTE(review): the end of this condition is not visible in this listing
     // and may admit further recurrence kinds.
2430 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2432 continue;
2433
2434 for (VPUser *U : collectUsersRecursively(PhiR))
2435 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2436 RecWithFlags->dropPoisonGeneratingFlags();
2437 }
2438 }
2439}
2440
2441namespace {
/// DenseMap traits used for common-subexpression elimination: hashing and
/// equality are based on the data of the recipe (recipe ID, opcode or
/// intrinsic ID, GEP source element type, scalar type, operands, predicate
/// and induction opcode) rather than on pointer identity.
2442struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2443 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2444 /// return that source element type.
2445 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2446 // All VPInstructions that lower to GEPs must have the i8 source element
2447 // type (as they are PtrAdds), so we omit it.
     // NOTE(review): the start of the type-switch expression is not visible in
     // this listing.
2449 .Case([](const VPReplicateRecipe *I) -> Type * {
2450 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2451 return GEP->getSourceElementType();
2452 return nullptr;
2453 })
2454 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2455 [](auto *I) { return I->getSourceElementType(); })
2456 .Default([](auto *) { return nullptr; });
2457 }
2458
2459 /// Returns true if recipe \p Def can be safely handled for CSE.
2460 static bool canHandle(const VPSingleDefRecipe *Def) {
2461 // We can extend the list of handled recipes in the future,
2462 // provided we account for the data embedded in them while checking for
2463 // equality or hashing.
2464 auto C = getOpcodeOrIntrinsicID(Def);
2465
2466 // The issue with (Insert|Extract)Value is that the index of the
2467 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2468 // VPlan.
2469 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2470 C->second == Instruction::ExtractValue)))
2471 return false;
2472
2473 // During CSE, we can only handle recipes that don't read from memory: if
2474 // they read from memory, there could be an intervening write to memory
2475 // before the next instance is CSE'd, leading to an incorrect result.
2476 return !Def->mayReadFromMemory();
2477 }
2478
2479 /// Hash the underlying data of \p Def.
2480 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
     // NOTE(review): the last argument(s) of hash_combine are not visible in
     // this listing.
2481 hash_code Result = hash_combine(
2482 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2483 getGEPSourceElementType(Def), Def->getScalarType(),
     // Mix in the compare predicate, or the induction opcode, if present.
2485 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2486 if (RFlags->hasPredicate())
2487 return hash_combine(Result, RFlags->getPredicate());
2488 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2489 return hash_combine(Result, SIVSteps->getInductionOpcode());
2490 return Result;
2491 }
2492
2493 /// Check equality of underlying data of \p L and \p R.
2494 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
     // NOTE(review): two clauses of this condition and the start of the
     // following assert are not visible in this listing.
2495 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2497 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2499 !equal(L->operands(), R->operands()))
2500 return false;
2502 "must have valid opcode info for both recipes");
     // Recipes with a predicate must agree on it.
2503 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2504 if (LFlags->hasPredicate() &&
2505 LFlags->getPredicate() !=
2506 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2507 return false;
     // Scalar IV steps must agree on their induction opcode.
2508 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2509 if (LSIV->getInductionOpcode() !=
2510 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2511 return false;
2512 // Recipes in replicate regions implicitly depend on predicate. If either
2513 // recipe is in a replicate region, only consider them equal if both have
2514 // the same parent.
2515 const VPRegionBlock *RegionL = L->getRegion();
2516 const VPRegionBlock *RegionR = R->getRegion();
2517 if (((RegionL && RegionL->isReplicator()) ||
2518 (RegionR && RegionR->isReplicator())) &&
2519 L->getParent() != R->getParent())
2520 return false;
2521 return L->getScalarType() == R->getScalarType();
2522 }
2523};
2524} // end anonymous namespace
2525
2526/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2527/// Plan.
/// A recipe equal to an earlier recorded one has its uses redirected to the
/// earlier recipe, provided the earlier recipe's block dominates it. The
/// redundant recipe itself is not erased here.
/// NOTE(review): the signature, the declaration of CSEMap and the header of
/// the block loop are not visible in this listing.
2529 VPDominatorTree VPDT(Plan);
2531
2533 Plan.getEntry());
2535 for (VPRecipeBase &R : *VPBB) {
2536 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2537 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2538 continue;
2539 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2540 // V must dominate Def for a valid replacement.
2541 if (!VPDT.dominates(V->getParent(), VPBB))
2542 continue;
2543 // Only keep flags present on both V and Def.
2544 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2545 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2546 Def->replaceAllUsesWith(V);
2547 continue;
2548 }
     // First occurrence of this expression: record it as the representative.
2549 CSEMap[Def] = Def;
2550 }
2551 }
2552}
2553
2554/// Return true if we do not know how to (mechanically) hoist or sink a
2555/// non-memory or memory recipe \p R out of a loop region. When sinking, passing
2556/// \p Sinking = true ensures that assumes aren't sunk.
/// NOTE(review): the first line of the signature is not visible in this
/// listing; callers in this file invoke it as
/// cannotHoistOrSinkRecipe(R, FirstBB, LastBB[, Sinking]).
2558 VPBasicBlock *LastBB,
2559 bool Sinking = false) {
     // Recipes that are not replicate recipes, or that do not touch memory, are
     // decided by the generic vputils check.
     // NOTE(review): the last clause of this condition is not visible in this
     // listing.
2560 if (!isa<VPReplicateRecipe>(R) || !R.mayReadOrWriteMemory() ||
2562 return vputils::cannotHoistOrSinkRecipe(R, Sinking);
2563
2564 // Check that the memory operation doesn't alias between FirstBB and LastBB.
2565 auto MemLoc = vputils::getMemoryLocation(R);
2566
2567 // TODO: Could make use of SinkStoreInfo::isNoAliasViaDistance by collecting
2568 // stores upfront, and constructing a full SinkStoreInfo.
     // Store-sinking information is only built when sinking.
2569 auto SinkInfo =
2570 Sinking ? std::make_optional(SinkStoreInfo(cast<VPReplicateRecipe>(R)))
2571 : std::nullopt;
2572
     // Without a known memory location the recipe cannot be moved.
2573 return !MemLoc ||
2574 !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB, SinkInfo);
2575}
2576
2577/// Move loop-invariant recipes out of the vector loop region in \p Plan.
/// Recipes whose operands are all defined outside loop regions are hoisted to
/// the vector preheader; recipes whose users all sit in a single block that
/// directly succeeds the loop region are sunk into that block.
2578static void licm(VPlan &Plan) {
2579 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2580
2581 // Hoist any loop invariant recipes from the vector loop region to the
2582 // preheader. Perform a shallow traversal of the vector loop region, to
2583 // exclude recipes in replicate regions. Since the top-level blocks in the
2584 // vector loop region are guaranteed to execute if the vector pre-header is,
2585 // we don't need to check speculation safety.
2586 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2587 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2588 "Expected vector prehader's successor to be the vector loop region");
     // NOTE(review): the start of the block-iteration loop header is not visible
     // in this listing; only its trailing range argument remains.
2590 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2591 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2592 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2593 LoopRegion->getExitingBasicBlock()))
2594 continue;
     // All operands must be defined outside of loop regions.
2595 if (any_of(R.operands(), [](VPValue *Op) {
2596 return !Op->isDefinedOutsideLoopRegions();
2597 }))
2598 continue;
     // Append the recipe to the end of the preheader.
2599 R.moveBefore(*Preheader, Preheader->end());
2600 }
2601 }
2602
     // The dominator tree is only needed by the assert further down.
2603#ifndef NDEBUG
2604 VPDominatorTree VPDT(Plan);
2605#endif
2606 // Sink recipes with no users inside the vector loop region if all users are
2607 // in the same exit block of the region.
2608 // TODO: Extend to sink recipes from inner loops.
     // NOTE(review): the declaration of the traversal and the header of the
     // block loop are not visible in this listing.
2610 LoopRegion->getEntry());
2612 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2613 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2614 LoopRegion->getExitingBasicBlock(),
2615 /*Sinking=*/true))
2616 continue;
2617
2618 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2619 assert(!RepR->isPredicated() &&
2620 "Expected prior transformation of predicated replicates to "
2621 "replicate regions");
2622 // narrowToSingleScalarRecipes should have already maximally narrowed
2623 // replicates to single-scalar replicates.
2624 // TODO: When unrolling, replicateByVF doesn't handle sunk
2625 // non-single-scalar replicates correctly.
2626 if (!RepR->isSingleScalar())
2627 continue;
2628
2629 // The pointer operand of stores must be loop-invariant.
2630 if (RepR->getOpcode() == Instruction::Store &&
2631 !RepR->getOperand(1)->isDefinedOutsideLoopRegions())
2632 continue;
2633 }
2634
2635 [[maybe_unused]] auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2636 assert((!R.mayWriteToMemory() ||
2637 (RepR && RepR->getOpcode() == Instruction::Store &&
2638 RepR->getOperand(1)->isDefinedOutsideLoopRegions())) &&
2639 "The only recipes that may write to memory are expected to be "
2640 "stores with invariant pointer-operand");
2641
2642 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2643 // support recipes with multiple defined values (e.g., interleaved loads).
2644 auto *Def = cast<VPSingleDefRecipe>(&R);
2645
2646 // Cannot sink the recipe if the user is defined in a loop region or a
2647 // non-successor of the vector loop region. Cannot sink if user is a phi
2648 // either.
     // The lambda also records the users' common block in SinkBB.
2649 VPBasicBlock *SinkBB = nullptr;
2650 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2651 auto *UserR = cast<VPRecipeBase>(U);
2652 VPBasicBlock *Parent = UserR->getParent();
2653 // TODO: Support sinking when users are in multiple blocks.
2654 if (SinkBB && SinkBB != Parent)
2655 return true;
2656 SinkBB = Parent;
2657 // TODO: If the user is a PHI node, we should check the block of
2658 // incoming value. Support PHI node users if needed.
2659 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2660 Parent->getSinglePredecessor() != LoopRegion;
2661 }))
2662 continue;
2663
     // A recipe without users is sunk into the region's single successor.
2664 if (!SinkBB)
2665 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2666
2667 // TODO: This will need to be a check instead of an assert after
2668 // conditional branches in vectorized loops are supported.
2669 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2670 "Defining block must dominate sink block");
2671 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2672 // just moving.
2673 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2674 }
2675 }
2676}
2677
// NOTE(review): the first line of the signature is not visible in this
// listing. The body shrinks recipes to the bit width recorded in MinBWs for
// their underlying instruction and zero-extends the narrowed result back to
// the original type for existing users.
2679 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2680 if (Plan.hasScalarVFOnly())
2681 return;
2682 // Keep track of created truncates, so they can be re-used. Note that we
2683 // cannot use RAUW after creating a new truncate, as this could make
2684 // other uses have different types for their operands, making them invalidly
2685 // typed.
     // NOTE(review): the declaration of ProcessedTruncs and the header of the
     // block loop are not visible in this listing.
2687 VPBasicBlock *PH = Plan.getVectorPreheader();
2690 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
     // NOTE(review): the condition guarding this `continue` is not visible in
     // this listing.
2693 continue;
2694
     // Recipes without a recorded minimal bit width are skipped.
2695 VPValue *ResultVPV = R.getVPSingleValue();
2696 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2697 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2698 if (!NewResSizeInBits)
2699 continue;
2700
2701 // If the value wasn't vectorized, we must maintain the original scalar
2702 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2703 // skip casts which do not need to be handled explicitly here, as
2704 // redundant casts will be removed during recipe simplification.
     // NOTE(review): the condition guarding this `continue` is not visible in
     // this listing, and no NumProcessedRecipes counter is visible in this
     // function -- the comment above may be stale.
2706 continue;
2707
2708 Type *OldResTy = ResultVPV->getScalarType();
2709 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2710 assert(OldResTy->isIntegerTy() && "only integer types supported");
2711 (void)OldResSizeInBits;
2712
2713 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2714
2715 // Any wrapping introduced by shrinking this operation shouldn't be
2716 // considered undefined behavior. So, we can't unconditionally copy
2717 // arithmetic wrapping flags to VPW.
2718 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2719 VPW->dropPoisonGeneratingFlags();
2720
2721 assert((OldResSizeInBits != NewResSizeInBits ||
2722 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2723 "Only ICmps should not need extending the result.");
2724 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2725
2726 // For loads/intrinsics we don't recreate the recipe; just wrap the
2727 // original wide result in a ZExt to OldResTy.
     // NOTE(review): the `if` opening this case and the start of the statement
     // creating Ext are not visible in this listing.
2729 if (OldResSizeInBits != NewResSizeInBits) {
2731 Instruction::ZExt, ResultVPV, OldResTy);
     // RAUW also rewrites Ext's own operand, so point it back at the result.
2732 ResultVPV->replaceAllUsesWith(Ext);
2733 Ext->setOperand(0, ResultVPV);
2734 }
2735 continue;
2736 }
2737
2738 // Shrink operands by introducing truncates as needed.
     // For selects, operand 0 (the condition) is left untouched.
2739 unsigned StartIdx =
2740 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2741 SmallVector<VPValue *> NewOperands(R.operands());
2742 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2743 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2744 if (OpSizeInBits == NewResSizeInBits)
2745 continue;
2746 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
     // Create the truncate only once per operand and reuse it afterwards.
     // Truncates of VPIRValue operands go into the preheader, all others are
     // inserted right before R.
2747 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2748 if (Inserted) {
2749 VPBuilder Builder;
2750 if (isa<VPIRValue>(Op))
2751 Builder.setInsertPoint(PH);
2752 else
2753 Builder.setInsertPoint(&R);
2754 ProcessedIter->second =
2755 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2756 }
2757 Op = ProcessedIter->second;
2758 }
2759
     // Re-create the recipe on the narrowed operands, placed before R.
2760 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2761 NWR->insertBefore(&R);
2762
2763 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2764 // users (unless this is an ICmp, which produces i1 regardless).
     // NOTE(review): the builder expression the ZExt is created with is not
     // visible in this listing.
2765 VPValue *Replacement = NWR->getVPSingleValue();
2766 if (OldResSizeInBits != NewResSizeInBits)
2767 Replacement =
2769 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2770 ->getVPSingleValue();
2771 ResultVPV->replaceAllUsesWith(Replacement);
2772 R.eraseFromParent();
2773 }
2774 }
2775}
2776
2777bool VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2778 std::optional<VPDominatorTree> VPDT;
2779 if (OnlyLatches)
2780 VPDT.emplace(Plan);
2781
2782 // Collect all blocks before modifying the CFG so we can identify unreachable
2783 // ones after constant branch removal.
2785
2786 bool SimplifiedPhi = false;
2787 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2788 VPValue *Cond;
2789 // Skip blocks that are not terminated by BranchOnCond.
2790 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2791 continue;
2792
2793 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2794 continue;
2795
2796 assert(VPBB->getNumSuccessors() == 2 &&
2797 "Two successors expected for BranchOnCond");
2798 unsigned RemovedIdx;
2799 if (match(Cond, m_True()))
2800 RemovedIdx = 1;
2801 else if (match(Cond, m_False()))
2802 RemovedIdx = 0;
2803 else
2804 continue;
2805
2806 VPBasicBlock *RemovedSucc =
2807 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2808 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2809 "There must be a single edge between VPBB and its successor");
2810 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2811 // these recipes.
2812 auto Phis = RemovedSucc->phis();
2813 for (VPRecipeBase &R : Phis)
2814 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2815 SimplifiedPhi |= !std::empty(Phis);
2816
2817 // Disconnect blocks and remove the terminator.
2818 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2819 VPBB->back().eraseFromParent();
2820 }
2821
2822 // Compute which blocks are still reachable from the entry after constant
2823 // branch removal.
2826
2827 // Detach all unreachable blocks from their successors, removing their recipes
2828 // and incoming values from phi recipes.
2829 VPSymbolicValue Tmp(nullptr);
2830 for (VPBlockBase *B : AllBlocks) {
2831 if (Reachable.contains(B))
2832 continue;
2833 for (VPBlockBase *Succ : to_vector(B->successors())) {
2834 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2835 for (VPRecipeBase &R : SuccBB->phis())
2836 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2838 }
2839 for (VPBasicBlock *DeadBB :
2841 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2842 for (VPValue *Def : R.definedValues())
2843 Def->replaceAllUsesWith(&Tmp);
2844 R.eraseFromParent();
2845 }
2846 }
2847 }
2848 return SimplifiedPhi;
2849}
2850
2871
2872// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2873// the loop terminator with a branch-on-cond recipe with the negated
2874// active-lane-mask as operand. Note that this turns the loop into an
2875// uncountable one. Only the existing terminator is replaced, all other existing
2876// recipes/users remain unchanged, except for poison-generating flags being
2877// dropped from the canonical IV increment. Return the created
2878// VPActiveLaneMaskPHIRecipe.
2879//
2880// The function adds the following recipes:
2881//
2882// vector.ph:
2883// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2884// %EntryALM = active-lane-mask %EntryInc, TC
2885//
2886// vector.body:
2887// ...
2888// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2889// ...
2890// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2891// %ALM = active-lane-mask %InLoopInc, TC
2892// %Negated = Not %ALM
2893// branch-on-cond %Negated
2894//
2897 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2898 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2899 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2900 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2901 // TODO: Check if dropping the flags is needed.
2902 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2903 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2904 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2905 // we have to take unrolling into account. Each part needs to start at
2906 // Part * VF
2907 auto *VecPreheader = Plan.getVectorPreheader();
2908 VPBuilder Builder(VecPreheader);
2909
2910 // Create the ActiveLaneMask instruction using the correct start values.
2911 VPValue *TC = Plan.getTripCount();
2912 VPValue *VF = &Plan.getVF();
2913
2914 auto *EntryIncrement = Builder.createOverflowingOp(
2915 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2916 DL, "index.part.next");
2917
2918 // Create the active lane mask instruction in the VPlan preheader.
2919 VPValue *ALMMultiplier =
2920 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2921 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2922 {EntryIncrement, TC, ALMMultiplier}, DL,
2923 "active.lane.mask.entry");
2924
2925 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2926 // preheader ActiveLaneMask instruction.
2927 auto *LaneMaskPhi =
2929 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2930 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2931
2932 // Create the active lane mask for the next iteration of the loop before the
2933 // original terminator.
2934 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2935 Builder.setInsertPoint(OriginalTerminator);
2936 auto *InLoopIncrement = Builder.createOverflowingOp(
2938 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2939 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2940 {InLoopIncrement, TC, ALMMultiplier}, DL,
2941 "active.lane.mask.next");
2942 LaneMaskPhi->addBackedgeValue(ALM);
2943
2944 // Replace the original terminator with BranchOnCond. We have to invert the
2945 // mask here because a true condition means jumping to the exit block.
2946 auto *NotMask = Builder.createNot(ALM, DL);
2947 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2948 OriginalTerminator->eraseFromParent();
2949 return LaneMaskPhi;
2950}
2951
2953 bool UseActiveLaneMaskForControlFlow) {
2954 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2955 auto *WideCanonicalIV =
2957 assert(WideCanonicalIV &&
2958 "Must have widened canonical IV when tail folding!");
2959 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2960 VPSingleDefRecipe *LaneMask;
2961 if (UseActiveLaneMaskForControlFlow) {
2962 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2963 } else {
2964 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2965 VPValue *ALMMultiplier =
2966 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2967 LaneMask =
2968 B.createNaryOp(VPInstruction::ActiveLaneMask,
2969 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2970 nullptr, "active.lane.mask");
2971 }
2972
2973 // Walk users of WideCanonicalIV and replace the header mask of the form
2974 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2975 // removing the old one to ensure there is always only a single header mask.
2976 HeaderMask->replaceAllUsesWith(LaneMask);
2977 HeaderMask->eraseFromParent();
2978}
2979
2980template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2981 Op0_t In;
2983
2984 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2985
2986 template <typename OpTy> bool match(OpTy *V) const {
2987 if (m_Specific(In).match(V)) {
2988 Out = nullptr;
2989 return true;
2990 }
2991 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2992 }
2993};
2994
2995/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2996/// Returns the remaining part \p Out if so, or nullptr otherwise.
2997template <typename Op0_t, typename Op1_t>
2998static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2999 Op1_t &Out) {
3000 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3001}
3002
3003static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
3004 switch (IntrID) {
3005 case Intrinsic::masked_udiv:
3006 return Intrinsic::vp_udiv;
3007 case Intrinsic::masked_sdiv:
3008 return Intrinsic::vp_sdiv;
3009 case Intrinsic::masked_urem:
3010 return Intrinsic::vp_urem;
3011 case Intrinsic::masked_srem:
3012 return Intrinsic::vp_srem;
3013 default:
3014 return std::nullopt;
3015 }
3016}
3017
3018/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3019/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3020/// recipe could be created.
3021/// \p HeaderMask Header Mask.
3022/// \p CurRecipe Recipe to be transformed.
3023/// \p EVL The explicit vector length parameter of vector-predication
3024/// intrinsics.
3026 VPRecipeBase &CurRecipe, VPValue &EVL) {
3027 VPlan *Plan = CurRecipe.getParent()->getPlan();
3028 DebugLoc DL = CurRecipe.getDebugLoc();
3029 VPValue *Addr, *Mask, *EndPtr;
3030
3031 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3032 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3033 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3034 EVLEndPtr->insertBefore(&CurRecipe);
3035 // Cast EVL (i32) to match the VF operand's type.
3036 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
3037 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
3039 EVLEndPtr->setOperand(1, EVLAsVF);
3040 return EVLEndPtr;
3041 };
3042
3043 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
3045 if (!V)
3046 return nullptr;
3047 auto *Reverse = new VPWidenIntrinsicRecipe(
3048 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3049 V->getScalarType(), {}, {}, DL);
3050 Reverse->insertBefore(&CurRecipe);
3051 return Reverse;
3052 };
3053
3054 if (match(&CurRecipe,
3055 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3056 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3057 EVL, Mask);
3058
3059 if (match(&CurRecipe,
3060 m_MaskedLoad(m_VPValue(EndPtr),
3061 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3062 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3063 Mask = GetVPReverse(Mask);
3064 Addr = AdjustEndPtr(EndPtr);
3065 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
3066 Addr, EVL, Mask);
3067 LoadR->insertBefore(&CurRecipe);
3068 VPValue *Poison = Plan->getPoison(LoadR->getScalarType());
3069 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3070 {Poison, LoadR, &EVL},
3071 LoadR->getScalarType(), {}, {}, DL);
3072 }
3073
3074 VPValue *Stride;
3076 m_VPValue(Addr), m_VPValue(Stride),
3077 m_RemoveMask(HeaderMask, Mask),
3078 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3079 if (!Mask)
3080 Mask = Plan->getTrue();
3081 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3082 NewLoad->setOperand(2, Mask);
3083 NewLoad->setOperand(3, &EVL);
3084 return NewLoad;
3085 }
3086
3087 VPValue *StoredVal;
3088 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3089 m_RemoveMask(HeaderMask, Mask))))
3090 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3091 StoredVal, EVL, Mask);
3092
3093 if (match(&CurRecipe,
3094 m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
3095 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3096 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3097 Mask = GetVPReverse(Mask);
3098 Addr = AdjustEndPtr(EndPtr);
3099 VPValue *Poison = Plan->getPoison(StoredVal->getScalarType());
3100 auto *SpliceR = new VPWidenIntrinsicRecipe(
3101 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3102 StoredVal->getScalarType(), {}, {}, DL);
3103 SpliceR->insertBefore(&CurRecipe);
3104 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3105 SpliceR, EVL, Mask);
3106 }
3107
3108 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3109 if (Rdx->isConditional() &&
3110 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3111 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3112
3113 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3114 if (Interleave->getMask() &&
3115 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3116 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3117
3118 VPValue *LHS, *RHS;
3119 if (match(&CurRecipe, m_SelectLike(m_RemoveMask(HeaderMask, Mask),
3121 return new VPWidenIntrinsicRecipe(
3122 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3123 LHS->getScalarType(), {}, {}, DL);
3124
3125 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3126 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3127 VPValue *ZExt =
3128 VPBuilder(&CurRecipe)
3129 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3130 return new VPInstruction(
3131 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3132 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3133 }
3134
3135 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3136 if (match(&CurRecipe,
3138 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3139 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3140 {RHS, Plan->getTrue(), LHS, &EVL},
3141 LHS->getScalarType(), {}, {}, DL);
3142
3143 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3144 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3145 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3146 return new VPWidenIntrinsicRecipe(*VPID,
3147 {IntrR->getOperand(0),
3148 IntrR->getOperand(1),
3149 Mask ? Mask : Plan->getTrue(), &EVL},
3150 IntrR->getScalarType(), {}, {}, DL);
3151
3152 return nullptr;
3153}
3154
3155/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3156/// The transforms here need to preserve the original semantics.
3158 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3159 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3162 m_VPValue(EVL))) &&
3163 match(EVL, m_EVL(m_VPValue()))) {
3164 HeaderMask = R.getVPSingleValue();
3165 break;
3166 }
3167 }
3168 if (!HeaderMask)
3169 return;
3170
3171 SmallVector<VPRecipeBase *> OldRecipes;
3172 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3174 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3175 NewR->insertBefore(R);
3176 for (auto [Old, New] :
3177 zip_equal(R->definedValues(), NewR->definedValues()))
3178 Old->replaceAllUsesWith(New);
3179 OldRecipes.push_back(R);
3180 }
3181 }
3182
3183 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3184 // False, EVL)
3185 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3186 VPValue *Mask;
3187 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3188 auto *LogicalAnd = cast<VPInstruction>(U);
3189 auto *Merge = new VPWidenIntrinsicRecipe(
3190 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3191 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3192 Merge->insertBefore(LogicalAnd);
3193 LogicalAnd->replaceAllUsesWith(Merge);
3194 OldRecipes.push_back(LogicalAnd);
3195 }
3196 }
3197
3198 // Fold the following splice patterns:
3199 // splice.right(splice.left(poison, x, evl), poison, evl) -> x
3200 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3201 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3202 for (VPUser *U : collectUsersRecursively(EVL)) {
3203 auto *R = cast<VPRecipeBase>(U);
3204 VPValue *X;
3207 m_Poison(), m_VPValue(X), m_Specific(EVL)),
3208 m_Poison(), m_Specific(EVL)))) {
3209 R->getVPSingleValue()->replaceAllUsesWith(X);
3210 OldRecipes.push_back(R);
3211 continue;
3212 }
3213
3214 if (!match(U,
3217 m_Poison(), m_VPValue(X), m_Specific(EVL))),
3219 m_Reverse(m_VPValue(X)), m_Poison(), m_Specific(EVL)))))
3220 continue;
3221
3222 auto *VPReverse = new VPWidenIntrinsicRecipe(
3223 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3224 X->getScalarType(), {}, {}, R->getDebugLoc());
3225 VPReverse->insertBefore(R);
3226 R->getVPSingleValue()->replaceAllUsesWith(VPReverse);
3227 OldRecipes.push_back(R);
3228 }
3229
3230 for (VPRecipeBase *R : reverse(OldRecipes)) {
3231 SmallVector<VPValue *> PossiblyDead(R->operands());
3232 R->eraseFromParent();
3233 for (VPValue *Op : PossiblyDead)
3235 }
3236}
3237
3238/// After replacing the canonical IV with an EVL-based IV, fix up recipes that
3239/// use VF to use the EVL instead to avoid incorrect updates on the penultimate
3240/// iteration.
3241static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3242 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3243 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3244
3245 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3246 VPValue *EVLAsIdx =
3250
3251 assert(all_of(Plan.getVF().users(),
3252 [&Plan](VPUser *U) {
3253 auto IsAllowedUser =
3254 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3255 VPWidenIntOrFpInductionRecipe,
3256 VPWidenMemIntrinsicRecipe>;
3257 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3258 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3259 IsAllowedUser);
3260 return IsAllowedUser(U);
3261 }) &&
3262 "User of VF that we can't transform to EVL.");
3263 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3265 });
3266
3267 assert(all_of(Plan.getVFxUF().users(),
3269 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3270 m_Specific(&Plan.getVFxUF())),
3272 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3273 "increment of the canonical induction.");
3274 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3275 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3276 // canonical induction must not be updated.
3278 });
3279
3280 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3281 // contained.
3282 bool ContainsFORs =
3284 if (ContainsFORs) {
3285 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3286 VPValue *MaxEVL = &Plan.getVF();
3287 // Emit a scalar cast in the preheader if VF is not a 32-bit integer.
3288 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3289 MaxEVL = Builder.createScalarZExtOrTrunc(
3290 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3292
3293 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3294 VPValue *PrevEVL = Builder.createScalarPhi(
3295 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3296
3299 for (VPRecipeBase &R : *VPBB) {
3300 VPValue *V1, *V2;
3301 if (!match(&R,
3303 m_VPValue(V1), m_VPValue(V2))))
3304 continue;
3305 VPValue *Imm = Plan.getOrAddLiveIn(
3308 Intrinsic::experimental_vp_splice,
3309 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3310 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3311 VPSplice->insertBefore(&R);
3312 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3313 }
3314 }
3315 }
3316
3317 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3318 if (!HeaderMask)
3319 return;
3320
3321 // Ensure that any reduction that uses a select to mask off tail lanes does so
3322 // in the vector loop, not the middle block, since EVL tail folding can have
3323 // tail elements in the penultimate iteration.
3324 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3325 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3326 m_VPValue(), m_VPValue()))))
3327 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3328 Plan.getVectorLoopRegion();
3329 return true;
3330 }));
3331
3332 // Replace header masks with a mask equivalent to predicating by EVL:
3333 //
3334 // icmp ule widen-canonical-iv backedge-taken-count
3335 // ->
3336 // icmp ult step-vector, EVL
3337 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3338 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3339 Type *EVLType = EVL.getScalarType();
3340 VPValue *EVLMask = Builder.createICmp(
3342 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3343 HeaderMask->replaceAllUsesWith(EVLMask);
3344}
3345
3346/// Converts a tail folded vector loop region to step by
3347/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3348/// iteration.
3349///
3350/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3351/// replace all uses of the canonical IV except for the canonical IV
3352/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3353/// only for loop iteration counting after this transformation.
3354///
3355/// - The header mask is replaced with a header mask based on the EVL.
3356///
3357/// - Plans with FORs have a new phi added to keep track of the EVL of the
3358/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3359/// @llvm.vp.splice.
3360///
3361/// The function uses the following definitions:
3362/// %StartV is the canonical induction start value.
3363///
3364/// The function adds the following recipes:
3365///
3366/// vector.ph:
3367/// ...
3368///
3369/// vector.body:
3370/// ...
3371/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3372/// [ %NextIter, %vector.body ]
3373/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3374/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3375/// ...
3376/// %OpEVL = cast i32 %VPEVL to IVSize
3377/// %NextIter = add IVSize %OpEVL, %CurrentIter
3378/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3379/// ...
3380///
3381/// If MaxSafeElements is provided, the function adds the following recipes:
3382/// vector.ph:
3383/// ...
3384///
3385/// vector.body:
3386/// ...
3387/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3388/// [ %NextIter, %vector.body ]
3389/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3390/// %cmp = cmp ult %AVL, MaxSafeElements
3391/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3392/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3393/// ...
3394/// %OpEVL = cast i32 %VPEVL to IVSize
3395/// %NextIter = add IVSize %OpEVL, %CurrentIter
3396/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3397/// ...
3398///
3400 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3401 if (Plan.hasScalarVFOnly())
3402 return;
3403 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3404 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3405
3406 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3407 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3408 VPValue *StartV = Plan.getZero(CanIVTy);
3409 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3410
3411 // Create the CurrentIteration recipe in the vector loop.
3412 auto *CurrentIteration =
3414 CurrentIteration->insertBefore(*Header, Header->begin());
3415 VPBuilder Builder(Header, Header->getFirstNonPhi());
3416 // Create the AVL (application vector length), starting from TC -> 0 in steps
3417 // of EVL.
3418 VPPhi *AVLPhi = Builder.createScalarPhi(
3419 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3420 VPValue *AVL = AVLPhi;
3421
3422 if (MaxSafeElements) {
3423 // Clamp the AVL to at most MaxSafeElements for correct loop emission.
3424 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3425 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3426 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3427 "safe_avl");
3428 }
3429 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3430 DebugLoc::getUnknown(), "evl");
3431
3432 Builder.setInsertPoint(CanonicalIVIncrement);
3433 VPValue *OpVPEVL = VPEVL;
3434
3435 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3436 OpVPEVL = Builder.createScalarZExtOrTrunc(
3437 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3438
3439 auto *NextIter = Builder.createAdd(
3440 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3441 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3442 CurrentIteration->addBackedgeValue(NextIter);
3443
3444 VPValue *NextAVL =
3445 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3446 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3447 AVLPhi->addIncoming(NextAVL);
3448
3449 fixupVFUsersForEVL(Plan, *VPEVL);
3450 removeDeadRecipes(Plan);
3451
3452 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3453 // except for the canonical IV increment.
3454 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3455 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3456 // TODO: support unroll factor > 1.
3457 Plan.setUF(1);
3458}
3459
3461 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3462 // There should be only one VPCurrentIteration in the entire plan.
3463 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3464
3467 for (VPRecipeBase &R : VPBB->phis())
3468 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3469 assert(!CurrentIteration &&
3470 "Found multiple CurrentIteration. Only one expected");
3471 CurrentIteration = PhiR;
3472 }
3473
3474 // Early return if it is not variable-length stepping.
3475 if (!CurrentIteration)
3476 return;
3477
3478 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3479 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3480
3481 // Convert CurrentIteration to concrete recipe.
3482 auto *ScalarR =
3483 VPBuilder(CurrentIteration)
3485 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3486 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3487 CurrentIteration->replaceAllUsesWith(ScalarR);
3488 CurrentIteration->eraseFromParent();
3489
3490 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3491 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3492 if (auto *CanIVInc = findUserOf(
3493 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3494 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3495 CanIVInc->eraseFromParent();
3496 }
3497}
3498
3500 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3501 if (!LoopRegion)
3502 return;
3503 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3504 if (Header->empty())
3505 return;
3506 // The EVL IV is always at the beginning.
3507 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3508 if (!EVLPhi)
3509 return;
3510
3511 // Bail if not an EVL tail folded loop.
3512 VPValue *AVL;
3513 if (!match(EVLPhi->getBackedgeValue(),
3514 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3515 return;
3516
3517 // The AVL may be capped to a safe distance.
3518 VPValue *SafeAVL, *UnsafeAVL;
3519 if (match(AVL,
3521 m_VPValue(SafeAVL)),
3522 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3523 AVL = UnsafeAVL;
3524
3525 VPValue *AVLNext;
3526 [[maybe_unused]] bool FoundAVLNext =
3528 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3529 assert(FoundAVLNext && "Didn't find AVL backedge?");
3530
3531 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3532 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3533 if (match(LatchBr, m_BranchOnCond(m_True())))
3534 return;
3535
3536 VPValue *CanIVInc;
3537 [[maybe_unused]] bool FoundIncrement = match(
3538 LatchBr,
3540 m_Specific(&Plan.getVectorTripCount()))));
3541 assert(FoundIncrement &&
3542 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3543 m_Specific(&Plan.getVFxUF()))) &&
3544 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3545 "trip count");
3546
3547 Type *AVLTy = AVLNext->getScalarType();
3548 VPBuilder Builder(LatchBr);
3549 LatchBr->setOperand(
3550 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3551}
3552
3554 VPlan &Plan, PredicatedScalarEvolution &PSE,
3555 const DenseMap<Value *, const SCEV *> &StridesMap,
3556 const VPDominatorTree &VPDT) {
3557 // Replace VPValues for strides that predicated scalar evolution guarantees to
3558 // be constant, but only in users guarded by the runtime checks; that is,
3559 // users in blocks dominated by the vector preheader.
3560 assert(!Plan.getVectorLoopRegion() &&
3561 "expected to run before loop regions are created");
3562 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
3563 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3564 auto *R = cast<VPRecipeBase>(&U);
3565 VPBlockBase *Parent = R->getParent();
3566 return VPDT.dominates(Preheader, Parent);
3567 };
3568 ValueToSCEVMapTy RewriteMap;
3569 for (const SCEV *Stride : StridesMap.values()) {
3570 using namespace SCEVPatternMatch;
3571 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3572 const APInt *StrideConst;
3573 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3574 // Only handle constant strides for now.
3575 continue;
3576
3577 auto *CI = Plan.getConstantInt(*StrideConst);
3578 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3579 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3580
3581 // The versioned value may not be used in the loop directly but through a
3582 // sext/zext. Add new live-ins in those cases.
3583 for (Value *U : StrideV->users()) {
3585 continue;
3586 VPValue *StrideVPV = Plan.getLiveIn(U);
3587 if (!StrideVPV)
3588 continue;
3589 unsigned BW = U->getType()->getScalarSizeInBits();
3590 APInt C =
3591 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3592 VPValue *CI = Plan.getConstantInt(C);
3593 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3594 }
3595 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3596 }
3597
3598 for (VPRecipeBase &R : *Plan.getEntry()) {
3599 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3600 if (!ExpSCEV)
3601 continue;
3602 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3603 auto *NewSCEV =
3604 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3605 if (NewSCEV != ScevExpr) {
3606 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3607 ExpSCEV->replaceAllUsesWith(NewExp);
3608 if (Plan.getTripCount() == ExpSCEV)
3609 Plan.resetTripCount(NewExp);
3610 }
3611 }
3612}
3613
// Purpose of this body: for widened memory recipes and interleave recipes
// whose mask is not the header mask, walk the backward slice of the recipe
// defining the address. Recipes with IR flags in that slice get their
// poison-generating flags dropped; disjoint ORs are instead replaced by an
// equivalent add.
// NOTE(review): listing lines 3614, 3617, 3619, 3633, 3659-3660 and 3685-3686
// are absent from this extract. The signature, the declarations of `Visited`,
// `Worklist`, `Instr` and `VPBB`, and the start of the pruning check
// presumably live there -- confirm against the original file.
3615 // Collect recipes in the backward slice of `Root` that may generate a poison
3616 // value that is used after vectorization.
3618 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3620 Worklist.push_back(Root);
3621
3622 // Traverse the backward slice of Root through its use-def chain.
3623 while (!Worklist.empty()) {
3624 VPRecipeBase *CurRec = Worklist.pop_back_val();
3625
// Each recipe is processed at most once.
3626 if (!Visited.insert(CurRec).second)
3627 continue;
3628
3629 // Prune search if we find another recipe generating a widen memory
3630 // instruction. Widen memory instructions involved in address computation
3631 // will lead to gather/scatter instructions, which don't need to be
3632 // handled.
// The visible tail of the check shows that header phis also stop the walk.
3634 VPHeaderPHIRecipe>(CurRec))
3635 continue;
3636
3637 // This recipe contributes to the address computation of a widen
3638 // load/store. If the underlying instruction has poison-generating flags,
3639 // drop them directly.
3640 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3641 VPValue *A, *B;
3642 // Dropping disjoint from an OR may yield incorrect results, as some
3643 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3644 // for dependence analysis). Instead, replace it with an equivalent Add.
3645 // This is possible as all users of the disjoint OR only access lanes
3646 // where the operands are disjoint or poison otherwise.
3647 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3648 RecWithFlags->isDisjoint()) {
// The add is created at the position of the OR and inherits its debug
// location and underlying value.
3649 VPBuilder Builder(RecWithFlags);
3650 VPInstruction *New =
3651 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3652 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3653 RecWithFlags->replaceAllUsesWith(New);
3654 RecWithFlags->eraseFromParent();
// The OR was just erased, so the operand walk below must use the new add.
3655 CurRec = New;
3656 } else
3657 RecWithFlags->dropPoisonGeneratingFlags();
3658 } else {
// Recipes without IR flags are only sanity-checked: the assert requires
// that `Instr`, when set, has no poison-generating flags.
3661 (void)Instr;
3662 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3663 "found instruction with poison generating flags not covered by "
3664 "VPRecipeWithIRFlags");
3665 }
3666
3667 // Add new definitions to the worklist.
// Operands without a defining recipe are not followed.
3668 for (VPValue *Operand : CurRec->operands())
3669 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3670 Worklist.push_back(OpDef);
3671 }
3672 });
3673
3674 // We want to exclude the tail folding case, as we don't need to drop flags
3675 // for operations computing the first lane in this case: the first lane of the
3676 // header mask must always be true.
// A null mask also yields false, so unmasked accesses are not processed.
3677 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3678 return Mask && !vputils::isHeaderMask(Mask, Plan);
3679 };
3680
3681 // Traverse all the recipes in the VPlan and collect the poison-generating
3682 // recipes in the backward slice starting at the address of a
3683 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3684 auto Iter =
3687 for (VPRecipeBase &Recipe : *VPBB) {
3688 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
// Widened accesses are handled only when they are consecutive and their
// address is defined by a recipe.
3689 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3690 if (AddrDef && WidenRec->isConsecutive() &&
3691 IsNotHeaderMask(WidenRec->getMask()))
3692 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3693 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
// Interleave recipes have no consecutiveness check here.
3694 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3695 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3696 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3697 }
3698 }
3699 }
3700}
3701
3703 VPlan &Plan,
3705 &InterleaveGroups,
3706 const bool &EpilogueAllowed) {
// Replaces the widened memory recipes of each group in InterleaveGroups with
// one VPInterleaveRecipe inserted before the recipe of the group's insert
// position. Groups with a member that has no recipe are skipped.
// EpilogueAllowed only influences whether the group needs a mask for gaps.
// NOTE(review): listing lines 3702, 3704, 3710, 3712-3713 and 3757 are absent
// from this extract. The function name, the element type of
// `InterleaveGroups`, the declarations of `IRMemberToRecipe` and `NW`, and the
// block range iterated below presumably live there -- confirm against the
// original file.
3707 if (InterleaveGroups.empty())
3708 return;
3709
// Map each original memory instruction to the recipe that widens it.
3711 for (VPBasicBlock *VPBB :
3714 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3715 return isa<VPWidenMemoryRecipe>(&R);
3716 })) {
3717 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3718 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3719 }
3720
3721 // Interleave memory: for each Interleave Group we marked earlier as relevant
3722 // for this VPlan, replace the Recipes widening its memory instructions with a
3723 // single VPInterleaveRecipe at its insertion point.
3724 VPDominatorTree VPDT(Plan);
3725 for (const auto *IG : InterleaveGroups) {
3726 // Skip interleave groups where members don't have recipes. This can happen
3727 // when removeDeadRecipes removes recipes that are part of interleave groups
3728 // but have no users.
3729 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3730 return !IRMemberToRecipe.contains(Member);
3731 }))
3732 continue;
3733
// Gather the stored values of all store members in member order and
// intersect the metadata of every member recipe, starting from member 0.
3734 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3735 VPIRMetadata InterleaveMD(*Start);
3736 SmallVector<VPValue *, 4> StoredValues;
3737 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3738 StoredValues.push_back(StoreR->getStoredValue());
3739 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3740 Instruction *MemberI = IG->getMember(I);
// A null member is a gap in the group.
3741 if (!MemberI)
3742 continue;
3743 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3744 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3745 StoredValues.push_back(StoreR->getStoredValue());
3746 InterleaveMD.intersect(*MemoryR);
3747 }
3748
// A gap mask is needed when the group requires a scalar epilogue that is
// not allowed, or when the group stores values and is not full.
3749 bool NeedsMaskForGaps =
3750 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3751 (!StoredValues.empty() && !IG->isFull());
3752
3753 Instruction *IRInsertPos = IG->getInsertPos();
3754 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3755 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3756
// If the pointer operand of the insert position (after stripping pointer
// casts) is a GEP, reuse its no-wrap flags with nuw removed.
3758 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3759 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3760 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3761
3762 // Get or create the start address for the interleave group.
3763 VPValue *Addr = Start->getAddr();
3764 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3765 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3766 // We cannot re-use the address of member zero because it does not
3767 // dominate the insert position. Instead, use the address of the insert
3768 // position and create a PtrAdd adjusting it to the address of member
3769 // zero.
3770 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3771 // InsertPos or sink loads above zero members to join it.
3772 assert(IG->getIndex(IRInsertPos) != 0 &&
3773 "index of insert position shouldn't be zero");
3774 auto &DL = IRInsertPos->getDataLayout();
// Offset is the alloc size of the accessed type times the index of the
// insert position, held in a 32-bit signed APInt; it is negated below.
3775 APInt Offset(32,
3776 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3777 IG->getIndex(IRInsertPos),
3778 /*IsSigned=*/true);
3779 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3780 VPBuilder B(InsertPosR);
3781 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3782 }
3783 // If the group is reverse, adjust the index to refer to the last vector
3784 // lane instead of the first. We adjust the index from the first vector
3785 // lane, rather than directly getting the pointer for lane VF - 1, because
3786 // the pointer operand of the interleaved access is supposed to be uniform.
3787 if (IG->isReverse()) {
3788 auto *ReversePtr = new VPVectorEndPointerRecipe(
3789 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3790 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3791 ReversePtr->insertBefore(InsertPosR);
3792 Addr = ReversePtr;
3793 }
// The interleave recipe takes the mask and debug location of the insert
// position and is placed directly before it.
3794 auto *VPIG = new VPInterleaveRecipe(
3795 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3796 InterleaveMD, InsertPosR->getDebugLoc());
3797 VPIG->insertBefore(InsertPosR);
3798
// J counts the results of the interleave recipe consumed so far. Only
// members with a non-void type have their uses redirected to a result;
// every member recipe is erased.
3799 unsigned J = 0;
3800 for (unsigned i = 0; i < IG->getFactor(); ++i)
3801 if (Instruction *Member = IG->getMember(i)) {
3802 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3803 if (!Member->getType()->isVoidTy()) {
3804 VPValue *OriginalV = MemberR->getVPSingleValue();
3805 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3806 J++;
3807 }
3808 MemberR->eraseFromParent();
3809 }
3810 }
3811}
3812
3813/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3814/// value, phi and backedge value. In the following example:
3815///
3816/// vector.ph:
3817/// Successor(s): vector loop
3818///
3819/// <x1> vector loop: {
3820/// vector.body:
3821/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3822/// ...
3823/// EMIT branch-on-count ...
3824/// No successors
3825/// }
3826///
3827/// WIDEN-INDUCTION will get expanded to:
3828///
3829/// vector.ph:
3830/// ...
3831/// vp<%induction.start> = ...
3832/// vp<%induction.increment> = ...
3833///
3834/// Successor(s): vector loop
3835///
3836/// <x1> vector loop: {
3837/// vector.body:
3838/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3839/// ...
3840/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3841/// EMIT branch-on-count ...
3842/// No successors
3843/// }
// NOTE(review): listing lines 3845, 3857-3858, 3880 and 3923 are absent from
// this extract. The function name and parameters, the declarations of `AddOp`,
// `MulOp` and `ExitingBB`, and the initializer of `IVIntTy` presumably live
// there -- confirm against the original file.
3844static void
3846 VPlan *Plan = WidenIVR->getParent()->getPlan();
3847 VPValue *Start = WidenIVR->getStartValue();
3848 VPValue *Step = WidenIVR->getStepValue();
3849 VPValue *VF = WidenIVR->getVFValue();
3850 DebugLoc DL = WidenIVR->getDebugLoc();
3851
3852 // The value from the original loop to which we are mapping the new induction
3853 // variable.
3854 Type *Ty = WidenIVR->getScalarType();
3855
3856 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// The IR flags of the induction recipe are reused for every arithmetic
// operation created below.
3859 VPIRFlags Flags = *WidenIVR;
// Integer inductions use Add/Mul; otherwise the add opcode comes from the
// induction descriptor and the multiply is an FMul.
3860 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3861 AddOp = Instruction::Add;
3862 MulOp = Instruction::Mul;
3863 } else {
3864 AddOp = ID.getInductionOpcode();
3865 MulOp = Instruction::FMul;
3866 }
3867
3868 // If the phi is truncated, truncate the start and step values.
3869 VPBuilder Builder(Plan->getVectorPreheader());
3870 Type *StepTy = Step->getScalarType();
3871 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3872 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3873 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3874 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3875 StepTy = Ty;
3876 }
3877
3878 // Construct the initial value of the vector IV in the vector loop preheader.
// Init = broadcast(Start) AddOp (step-vector MulOp broadcast(Step)); the step
// vector is converted with UIToFP first when the step type is floating point.
3879 Type *IVIntTy =
3881 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3882 if (StepTy->isFloatingPointTy())
3883 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3884
3885 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3886 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3887
3888 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3889 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3890 DebugLoc::getUnknown(), "induction");
3891
3892 // Create the widened phi of the vector IV.
// The phi is created at the position of WidenIVR, not in the preheader.
3893 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3894 Init, WidenIVR->getDebugLoc(), "vec.ind");
3895
3896 // Create the backedge value for the vector IV.
3897 VPValue *Inc;
3898 VPValue *Prev;
3899 // If unrolled, use the increment and prev value from the operands.
3900 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3901 Inc = SplatVF;
3902 Prev = WidenIVR->getLastUnrolledPartOperand();
3903 } else {
3904 // Move the insertion point after the VF definition when the VF is defined
3905 // inside a loop, such as for EVL tail-folding.
3906 if (VPRecipeBase *R = VF->getDefiningRecipe())
3907 if (R->getParent()->getEnclosingLoopRegion())
3908 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3909
3910 // Multiply the vectorization factor by the step using integer or
3911 // floating-point arithmetic as appropriate.
3912 if (StepTy->isFloatingPointTy())
3913 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3914 DL);
3915 else
3916 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3917
// Inc = broadcast(Step MulOp VF); the previous value is the phi itself.
3918 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3919 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3920 Prev = WidePHI;
3921 }
3922
// The increment is emitted directly before the terminator of ExitingBB.
3924 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3925 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3926 WidenIVR->getDebugLoc(), "vec.ind.next");
3927
3928 WidePHI->addIncoming(Next);
3929
// Users are redirected to the phi; WidenIVR itself is not erased here.
3930 WidenIVR->replaceAllUsesWith(WidePHI);
3931}
3932
3933/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3934/// initial value, phi and backedge value. In the following example:
3935///
3936/// <x1> vector loop: {
3937/// vector.body:
3938/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3939/// ...
3940/// EMIT branch-on-count ...
3941/// }
3942///
3943/// WIDEN-POINTER-INDUCTION will get expanded to:
3944///
3945/// <x1> vector loop: {
3946/// vector.body:
3947/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3948/// EMIT %mul = mul %stepvector, %step
3949/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3950/// ...
3951/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3952/// EMIT branch-on-count ...
3953/// }
// NOTE(review): listing lines 3954, 3961 and 3984 are absent from this
// extract. The signature (which declares `R`), the right-hand side of the
// first assert comparison and the declaration of `ExitingBB` presumably live
// there -- confirm against the original file.
3955 VPlan *Plan = R->getParent()->getPlan();
3956 VPValue *Start = R->getStartValue();
3957 VPValue *Step = R->getStepValue();
3958 VPValue *VF = R->getVFValue();
3959
// Preconditions: R has pointer scalar type and is not a recipe that only
// generates scalars (the asserts say that case is replaced elsewhere).
3960 assert(R->getInductionDescriptor().getKind() ==
3962 "Not a pointer induction according to InductionDescriptor!");
3963 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3964 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3965 "Recipe should have been replaced");
3966
3967 VPBuilder Builder(R);
3968 DebugLoc DL = R->getDebugLoc();
3969
3970 // Build a scalar pointer phi.
3971 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3972
3973 // Create actual address geps that use the pointer phi as base and a
3974 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
// These recipes are placed at the first non-phi position of the block of R.
3975 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3976 Type *StepTy = Step->getScalarType();
3977 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3978 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3979 VPValue *PtrAdd =
3980 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
// Users are redirected to the wide pointer add; R itself is not erased here.
3981 R->replaceAllUsesWith(PtrAdd);
3982
3983 // Create the backedge value for the scalar pointer phi.
// Emitted before the terminator of ExitingBB: ptr.ind = pointer.phi advanced
// by Step * VF, with VF zero-extended or truncated to the step type.
3985 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3986 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3987 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3988
3989 VPValue *InductionGEP =
3990 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3991 ScalarPtrPhi->addIncoming(InductionGEP);
3992}
3993
3994/// Expand a VPDerivedIVRecipe into executable recipes.
// NOTE(review): listing lines 3995, 4006, 4008, 4014, 4017 and 4029 are absent
// from this extract. The signature (which declares `R`), the tail of the
// SIToFP cast call and the `case` labels of the switch presumably live there
// -- confirm against the original file. The comments below name the cases only
// by what the visible code does.
// All new recipes are created at the position of R; R is replaced but not
// erased by this function.
3996 VPBuilder Builder(R);
3997 VPIRValue *Start = R->getStartValue();
3998 VPValue *Step = R->getStepValue();
3999 VPValue *Index = R->getIndex();
4000 Type *StepTy = Step->getScalarType();
4001 Type *IndexTy = Index->getScalarType();
// Bring the index to the step type: sign-extend or truncate for integer
// steps, SIToFP otherwise.
4002 Index = StepTy->isIntegerTy()
4003 ? Builder.createScalarSExtOrTrunc(
4004 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
4005 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
4007 switch (R->getInductionKind()) {
// Case producing Start + Index * Step with an integer add; index and start
// types must match.
4009 assert(Index->getScalarType() == Start->getScalarType() &&
4010 "Index type does not match StartValue type");
4011 return R->replaceAllUsesWith(Builder.createAdd(
4012 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
4013 }
// Case producing a pointer add of Start and Index * Step.
4015 return R->replaceAllUsesWith(Builder.createPtrAdd(
4016 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// Floating-point case: Start (FAdd|FSub) (Step FMul Index), using the fast-math
// flags of the original binary operator.
4018 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
4019 const FPMathOperator *FPBinOp = R->getFPBinOp();
4020 assert(FPBinOp &&
4021 (FPBinOp->getOpcode() == Instruction::FAdd ||
4022 FPBinOp->getOpcode() == Instruction::FSub) &&
4023 "Original BinOp should be defined for FP induction");
4024 FastMathFlags FMF = FPBinOp->getFastMathFlags();
4025 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
4026 return R->replaceAllUsesWith(
4027 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
4028 }
// Remaining visible case: nothing is created and R is left untouched.
4030 return;
4031 }
4032 llvm_unreachable("Unhandled induction kind");
4033}
4034
// NOTE(review): listing lines 4035 and 4038 are absent from this extract. The
// signature and the head of the range-for that declares `R` presumably live
// there -- confirm against the original file.
4036 // Replace loop regions with explicit CFG.
// Regions are collected first and dissolved afterwards, so the traversal that
// starts at the plan entry is finished before any region is changed.
4037 SmallVector<VPRegionBlock *> LoopRegions;
4039 vp_depth_first_deep(Plan.getEntry()))) {
// Replicator regions are left as they are.
4040 if (!R->isReplicator())
4041 LoopRegions.push_back(R);
4042 }
4043 for (VPRegionBlock *R : LoopRegions)
4044 R->dissolveToCFGLoop();
4045}
4046
// NOTE(review): listing lines 4047-4048, 4051-4052, 4101 and 4105 are absent
// from this extract. The signature, the declaration of `WorkList`, the head of
// the loop that declares `VPBB`, and the calls that create the two
// single-condition branches presumably live there -- confirm against the
// original file.
4049 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4050 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// Branches are collected first; the rewrite loop below creates new blocks and
// changes successor edges.
4053 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4054 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4055 }
4056
4057 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4058 // single-condition branches:
4059 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4060 // the first condition is true, and otherwise jumps to a new interim block.
4061 // 2. A branch that ends the interim block, jumps to the second successor if
4062 // the second condition is true, and otherwise jumps to the third
4063 // successor.
4064 for (VPInstruction *Br : WorkList) {
4065 assert(Br->getNumOperands() == 2 &&
4066 "BranchOnTwoConds must have exactly 2 conditions");
4067 DebugLoc DL = Br->getDebugLoc();
4068 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy the successor list before disconnecting, since disconnecting changes
// the list owned by the block.
4069 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4070 assert(Successors.size() == 3 &&
4071 "BranchOnTwoConds must have exactly 3 successors");
4072
4073 for (VPBlockBase *Succ : Successors)
4074 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4075
4076 VPValue *Cond0 = Br->getOperand(0);
4077 VPValue *Cond1 = Br->getOperand(1);
4078 VPBlockBase *Succ0 = Successors[0];
4079 VPBlockBase *Succ1 = Successors[1];
4080 VPBlockBase *Succ2 = Successors[2];
4081
4082 // If the successor block for both conditions is the same, then combine the
4083 // two conditions and plant a single conditional branch.
4084 if (Succ0 == Succ1) {
// The OR and the new BranchOnCond are built before Br, which is then erased.
// No interim block is created on this path.
4085 VPBuilder Builder(Br);
4086 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
4087 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
4088 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4089 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4090 Br->eraseFromParent();
4091 continue;
4092 }
4093
// None of the involved blocks may still be nested in a region.
4094 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4095 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4096
4097 VPBasicBlock *InterimBB =
4098 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4099
// Original block: successors become Succ0 and the interim block.
4100 VPBuilder(BrOnTwoCondsBB)
4102 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4103 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4104
// Interim block: successors become Succ1 and Succ2.
4106 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4107 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4108 Br->eraseFromParent();
4109 }
4110}
4111
4114 vp_depth_first_deep(Plan.getEntry()))) {
// Lowers a set of abstract recipes to concrete ones. Each handled kind below
// is replaced (or completed) in place and the loop moves on to the next
// recipe; recipes matching none of the checks are left untouched.
// NOTE(review): listing lines 4112-4113, 4118, 4133, 4197, 4220, 4233, 4256
// and 4264 are absent from this extract. The signature, the head of the outer
// loop that declares `VPBB`, the calls that expand the two widened induction
// kinds, the declaration of `NotMasks`, and parts of several `match`
// conditions presumably live there -- confirm against the original file.
// Early-increment iteration allows erasing the current recipe inside the loop.
4115 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// New recipes are inserted at the position of the recipe being lowered.
4116 VPBuilder Builder(&R);
4117 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4119 WidenIVR->eraseFromParent();
4120 continue;
4121 }
4122
4123 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4124 // If the recipe only generates scalars, scalarize it instead of
4125 // expanding it.
4126 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4127 VPValue *PtrAdd =
4128 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4129 WidenIVR->replaceAllUsesWith(PtrAdd);
4130 WidenIVR->eraseFromParent();
4131 continue;
4132 }
4134 WidenIVR->eraseFromParent();
4135 continue;
4136 }
4137
4138 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4139 expandVPDerivedIV(DerivedIVR);
4140 DerivedIVR->eraseFromParent();
4141 continue;
4142 }
4143
// Widened canonical IV:
// vec.iv = broadcast(CanIV) + (broadcast(Step) + step-vector).
4144 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4145 VPValue *CanIV = WideCanIV->getCanonicalIV();
4146 Type *CanIVTy = CanIV->getScalarType();
4147 VPValue *Step = WideCanIV->getStepValue();
// A missing step is only expected for UF == 1 and is treated as zero.
4148 if (!Step) {
4149 assert(Plan.getConcreteUF() == 1 &&
4150 "Expected unroller to have materialized step for UF != 1");
4151 Step = Plan.getZero(CanIVTy);
4152 }
4153 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4154 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4155 Step = Builder.createAdd(
4156 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4157 VPValue *CanVecIV =
4158 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4159 WideCanIV->getNoWrapFlags());
4160 WideCanIV->replaceAllUsesWith(CanVecIV);
4161 WideCanIV->eraseFromParent();
4162 continue;
4163 }
4164
4165 // Expand VPBlendRecipe into VPInstruction::Select.
// The chain starts with incoming value 0; each further incoming value I is
// selected over the chain built so far when its mask is set.
4166 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4167 VPValue *Select = Blend->getIncomingValue(0);
4168 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4169 Select = Builder.createSelect(Blend->getMask(I),
4170 Blend->getIncomingValue(I), Select,
4171 R.getDebugLoc(), "predphi", *Blend);
4172 Blend->replaceAllUsesWith(Select);
4173 Blend->eraseFromParent();
4174 continue;
4175 }
4176
// Vector end pointers are kept; only a missing offset is materialized, which
// is expected for UF == 1 only.
4177 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4178 if (!VEPR->getOffset()) {
4179 assert(Plan.getConcreteUF() == 1 &&
4180 "Expected unroller to have materialized offset for UF != 1");
4181 VEPR->materializeOffset();
4182 }
4183 continue;
4184 }
4185
// Expression recipes are decomposed and then erased.
4186 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4187 Expr->decompose();
4188 Expr->eraseFromParent();
4189 continue;
4190 }
4191
4192 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4193 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4194 if (LastActiveL &&
4195 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4196 // Create Not(Mask) for all operands.
4198 for (VPValue *Op : LastActiveL->operands()) {
4199 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4200 NotMasks.push_back(NotMask);
4201 }
4202
4203 // Create FirstActiveLane on the inverted masks.
4204 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4205 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4206
4207 // Subtract 1 to get the last active lane.
4208 VPValue *One =
4209 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4210 VPValue *LastLane =
4211 Builder.createSub(FirstInactiveLane, One,
4212 LastActiveL->getDebugLoc(), "last.active.lane");
4213
4214 LastActiveL->replaceAllUsesWith(LastLane);
4215 LastActiveL->eraseFromParent();
4216 continue;
4217 }
4218
4219 // Lower MaskedCond with block mask to LogicalAnd.
// Result: LogicalAnd(mask, operand 0). The instruction must be masked.
4221 auto *VPI = cast<VPInstruction>(&R);
4222 assert(VPI->isMasked() &&
4223 "Unmasked MaskedCond should be simplified earlier");
4224 VPI->replaceAllUsesWith(Builder.createNaryOp(
4225 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4226 VPI->eraseFromParent();
4227 continue;
4228 }
4229
4230 // Lower CanonicalIVIncrementForPart to plain Add.
// Operands, no-wrap flags and debug location carry over to the add.
4231 if (match(
4232 &R,
4234 auto *VPI = cast<VPInstruction>(&R);
4235 VPValue *Add = Builder.createOverflowingOp(
4236 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4237 VPI->getDebugLoc());
4238 VPI->replaceAllUsesWith(Add);
4239 VPI->eraseFromParent();
4240 continue;
4241 }
4242
4243 // Lower BranchOnCount to ICmp + BranchOnCond.
// The branch is taken when IV == TC.
4244 VPValue *IV, *TC;
4245 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4246 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4247 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4248 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4249 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4250 BranchOnCountInst->eraseFromParent();
4251 continue;
4252 }
4253
// Everything that is not matched by the (partly elided) check below is left
// as it is.
4254 VPValue *VectorStep;
4255 VPValue *ScalarStep;
4257 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4258 continue;
4259
4260 // Expand WideIVStep.
4261 auto *VPI = cast<VPInstruction>(&R);
4262 Type *IVTy = VPI->getScalarType();
// Convert the vector step to the IV type with UIToFP or Trunc; the selecting
// condition is on the elided line.
4263 if (VectorStep->getScalarType() != IVTy) {
4265 ? Instruction::UIToFP
4266 : Instruction::Trunc;
4267 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4268 }
4269
4270 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4271 if (ScalarStep->getScalarType() != IVTy) {
4272 ScalarStep =
4273 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4274 }
4275
// Floating-point IVs multiply with FMul and the fast-math flags of the
// instruction; all others use Mul with the default flags for that opcode.
4276 VPIRFlags Flags;
4277 unsigned MulOpc;
4278 if (IVTy->isFloatingPointTy()) {
4279 MulOpc = Instruction::FMul;
4280 Flags = VPI->getFastMathFlagsOrNone();
4281 } else {
4282 MulOpc = Instruction::Mul;
4283 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4284 }
4285
4286 VPInstruction *Mul = Builder.createNaryOp(
4287 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4288 VectorStep = Mul;
4289 VPI->replaceAllUsesWith(VectorStep);
4290 VPI->eraseFromParent();
4291 }
4292 }
4293}
4294
4295/// Returns the VPValue representing the uncountable exit comparison used by
4296/// AnyOf if the recipes it depends on can be traced back to live-ins and
4297/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
4298/// generating the values for the comparison. The recipes are stored in
4299/// \p Recipes.
// Returns std::nullopt when the latch terminator does not have the expected
// shape, when a visited value has more than one user, when a value is defined
// by an unsupported recipe, or when the final check on the collected recipes
// fails. Otherwise returns the value that feeds the any-of.
// NOTE(review): listing lines 4301, 4359, 4386, 4391 and 4402 are absent from
// this extract. The function name with the `Recipes` parameter, the
// declaration of `Worklist`, and the heads of two `match` calls and of the
// final predicate presumably live there -- confirm against the original file.
4300static std::optional<VPValue *>
4302 VPBasicBlock *LatchVPBB) {
4303 // Given a plain CFG VPlan loop with countable latch exiting block
4304 // \p LatchVPBB, we're looking to match the recipes contributing to the
4305 // uncountable exit condition comparison (here, vp<%4>) back to either
4306 // live-ins or the address nodes for the load used as part of the uncountable
4307 // exit comparison so that we can either move them within the loop, or copy
4308 // them to the preheader depending on the chosen method for dealing with
4309 // stores in uncountable exit loops.
4310 //
4311 // Currently, the address of the load is restricted to a GEP with 2 operands
4312 // and a live-in base address. This constraint may be relaxed later.
4313 //
4314 // VPlan ' for UF>=1' {
4315 // Live-in vp<%0> = VF * UF
4316 // Live-in vp<%1> = vector-trip-count
4317 // Live-in ir<20> = original trip-count
4318 //
4319 // ir-bb<entry>:
4320 // Successor(s): scalar.ph, vector.ph
4321 //
4322 // vector.ph:
4323 // Successor(s): for.body
4324 //
4325 // for.body:
4326 // EMIT vp<%2> = phi ir<0>, vp<%index.next>
4327 // EMIT-SCALAR ir<%iv> = phi [ ir<0>, vector.ph ], [ ir<%iv.next>, for.inc ]
4328 // EMIT ir<%uncountable.addr> = getelementptr inbounds nuw ir<%pred>,ir<%iv>
4329 // EMIT ir<%uncountable.val> = load ir<%uncountable.addr>
4330 // EMIT ir<%uncountable.cond> = icmp sgt ir<%uncountable.val>, ir<500>
4331 // EMIT vp<%3> = masked-cond ir<%uncountable.cond>
4332 // Successor(s): for.inc
4333 //
4334 // for.inc:
4335 // EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
4336 // EMIT ir<%countable.cond> = icmp eq ir<%iv.next>, ir<20>
4337 // EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
4338 // EMIT vp<%4> = any-of vp<%3>
4339 // EMIT vp<%5> = icmp eq vp<%index.next>, vp<%1>
4340 // EMIT branch-on-two-conds vp<%4>, vp<%5>
4341 // Successor(s): middle.block, middle.block, for.body
4342 //
4343 // middle.block:
4344 // Successor(s): ir-bb<exit>, scalar.ph
4345 //
4346 // ir-bb<exit>:
4347 // No successors
4348 //
4349 // scalar.ph:
4350 // }
4351
4352 // Find the uncountable loop exit condition.
// The latch must end in branch-on-two-conds whose first condition is an
// any-of; the operand of that any-of is captured.
4353 VPValue *UncountableCondition = nullptr;
4354 if (!match(LatchVPBB->getTerminator(),
4355 m_BranchOnTwoConds(m_AnyOf(m_VPValue(UncountableCondition)),
4356 m_VPValue())))
4357 return std::nullopt;
4358
// Walk the operands backwards starting at the captured condition. Recipes are
// appended to `Recipes` in walk order, i.e. a user before its operands.
4360 Worklist.push_back(UncountableCondition);
4361 while (!Worklist.empty()) {
4362 VPValue *V = Worklist.pop_back_val();
4363
4364 // Any value defined outside the loop does not need to be copied.
4365 if (V->isDefinedOutsideLoopRegions())
4366 continue;
4367
4368 // FIXME: Remove the single user restriction; it's here because we're
4369 // starting with the simplest set of loops we can, and multiple
4370 // users means needing to add PHI nodes in the transform.
4371 if (V->getNumUsers() > 1)
4372 return std::nullopt;
4373
4374 VPValue *Op1, *Op2;
4375 // Walk back through recipes until we find at least one load from memory.
4376 if (match(V, m_ICmp(m_VPValue(Op1), m_VPValue(Op2)))) {
4377 Worklist.push_back(Op1);
4378 Worklist.push_back(Op2);
4379 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4380 } else if (match(V, m_VPInstruction<Instruction::Load>(m_VPValue(Op1)))) {
// NOTE(review): the result of getDefiningRecipe() is dereferenced without a
// null check; presumably the load address is always defined by a recipe when
// this point is reached -- confirm.
4381 VPRecipeBase *GepR = Op1->getDefiningRecipe();
4382 // Only matching base + single offset term for now.
4383 if (GepR->getNumOperands() != 2)
4384 return std::nullopt;
4385 // Matching a GEP with a loop-invariant base ptr.
4387 m_LiveIn(), m_VPValue())))
4388 return std::nullopt;
// The load and its address recipe are both recorded; the operands of the
// address recipe are not followed any further.
4389 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4390 Recipes.push_back(cast<VPInstruction>(GepR));
4392 m_VPValue(Op1)))) {
4393 Worklist.push_back(Op1);
4394 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4395 } else
4396 return std::nullopt;
4397 }
4398
4399 // If we couldn't match anything, don't return the condition. It may be
4400 // defined outside the loop.
4401 if (Recipes.empty() || none_of(Recipes, [](VPInstruction *I) {
4403 }))
4404 return std::nullopt;
4405
4406 return UncountableCondition;
4407}
4408
4414
4415/// Update \p Plan to mask memory operations in the loop based on whether the
4416/// early exit is taken or not.
4417///
4418/// We're currently expecting to find a loop with properties similar to the
4419/// following:
4420///
4421/// for.body:
4422/// ir<%indvars.iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0>
4423/// EMIT ir<%arrayidx> = getelementptr inbounds nuw ir<@c>, ir<%indvars.iv>
4424/// EMIT-SCALAR ir<%0> = load ir<%arrayidx>
4425/// EMIT ir<%cmp1> = icmp sgt ir<%0>, ir<5>
4426/// EMIT vp<%1> = masked-cond ir<%cmp1>
4427/// Successor(s): if.end
4428///
4429/// if.end:
4430/// EMIT ir<%arrayidx3> = getelementptr inbounds nuw ir<@src>, ir<%indvars.iv>
4431/// EMIT-SCALAR ir<%2> = load ir<%arrayidx3>
4432/// EMIT ir<%add> = add nsw ir<%2>, ir<42>
4433/// EMIT ir<%arrayidx5> = getelementptr inbounds nuw ir<@dst>, ir<%indvars.iv>
4434/// EMIT store ir<%add>, ir<%arrayidx5>
4435/// EMIT ir<%indvars.iv.next> = add nuw nsw ir<%indvars.iv>, ir<1>
4436/// EMIT vp<%3> = any-of vp<%1>
4437/// EMIT ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<10000>
4438/// EMIT branch-on-two-conds vp<%3>, ir<%exitcond.not>
4439/// Successor(s): middle.block, middle.block, for.body
4440///
4441/// We currently expect LoopVectorizationLegality to ensure that:
4442/// * There must also be a counted exit. We will need to support speculative
4443/// or first-faulting loads before we can remove this restriction.
4444/// * Any stores within the loop must not alias with the load used for the
4445/// uncountable exit. We can relax this a bit with runtime aliasing checks.
4446/// * Other memory operations in the loop can take place before or after the
4447/// uncountable exit, but must also be unconditional. We need to support
4448/// combining the conditions in VPlanPredicator.
4449/// * The loop must have a single unconditional load contributing to the
4450/// uncountable exit comparison, and the other term must be loop-invariant.
4451/// Improving upon this requires work in getRecipesForUncountableExit to
4452/// handle more complex recipe graphs.
// Rewrites the plan so that the vector loop stops short of an uncountable
// early exit and the scalar loop resumes from the exiting iteration: the
// early-exit condition is moved to the top of the header, memory operations
// are masked with it, the middle-block branch is recomputed, and the resume
// phi in the scalar preheader is updated. Returns false whenever a pattern
// required below is not found; returns true after the resume phi is updated.
// NOTE(review): the listing lines naming this function and declaring its
// leading parameters (`Plan` and `Exits` are used below) are absent from this
// extract, as are a few lines inside the body; confirm against upstream.
4455 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4456 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4457 AssumptionCache *AC) {
4458
4459 // Disconnect early exiting blocks from successors, remove branches. We
4460 // currently don't support multiple uses for recipes involved in creating
4461 // the uncountable exit condition.
4462 for (auto &Exit : Exits) {
// An exit whose exiting block is the latch itself is left connected.
4463 if (Exit.EarlyExitingVPBB == LatchVPBB)
4464 continue;
4465
4466 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4467 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4468 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4469 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4470 }
4471
// The dominator tree is built only after the CFG edits above.
4472 VPDominatorTree VPDT(Plan);
4473
4474 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4475 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4476 // version of the loop.
4477 SmallVector<VPInstruction *, 8> ConditionRecipes;
4478
// getRecipesForUncountableExit fills ConditionRecipes and yields the exit
// condition; an empty optional means the pattern was not recognised.
4479 std::optional<VPValue *> Cond =
4480 getRecipesForUncountableExit(ConditionRecipes, LatchVPBB);
4481 if (!Cond)
4482 return false;
4483
4484 // Find load contributing to condition.
4485 // At the moment LoopVectorizationLegality only supports a single
4486 // early-exit expression with a compare and a single load that must
4487 // be unconditional.
4488 // TODO: Support more than one load.
// NOTE(review): the predicate line of the lambda below is missing from this
// listing. Also, the two expectations that follow are asserts only, so in a
// build without assertions a null Load would be dereferenced -- confirm that
// legality guarantees exactly one load.
4489 auto *Load =
4490 find_singleton<VPInstruction>(ConditionRecipes, [](auto *I, bool _) {
4492 ? I
4493 : nullptr;
4494 });
4495 assert(Load && "Couldn't find exactly one load");
4496 // TODO: Support conditional loads for uncountable exits.
4497 assert(VPDT.dominates(Load->getParent(), LatchVPBB) &&
4498 "Uncountable exit condition load is conditional.");
// Operand 0 of the load is its address; it must be a VPInstruction.
4499 VPInstruction *Ptr = cast<VPInstruction>(Load->getOperand(0));
4500
4501 // Ensure that we are guaranteed to be able to dereference the memory used
4502 // for determining the uncountable exit for the maximum possible number of
4503 // scalar iterations of the loop.
4504 //
4505 // TODO: Support first-faulting loads in cases where we don't know whether
4506 // all possible addresses are dereferenceable.
4507 {
// NOTE(review): the declaration of `Predicates` and the start of the check
// whose arguments appear below are missing from this listing.
4509 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4510 const DataLayout &DL = Plan.getDataLayout();
// Element size in bytes (store size of the loaded type), expressed in the
// index width of the pointer type.
4511 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4512 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
4514 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4515 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4516 &Predicates))
4517 return false;
4518 }
4519
4520 // Check for a single GEP for the condition load to see if we can link it to
4521 // a widen IV recipe with a step of 1; we're only interested in contiguous
4522 // accesses for the condition load right now.
// The first recipe of the header is required to be a widened induction that
// starts at 0 and steps by 1.
4523 auto *IV = cast<VPWidenInductionRecipe>(&HeaderVPBB->front());
4524 if (!match(IV->getStartValue(), m_SpecificInt(0)) ||
4525 !match(IV->getStepValue(), m_SpecificInt(1)))
4526 return false;
// NOTE(review): the first line of the next condition is missing from this
// listing; only its tail, which matches against IV, is visible.
4528 m_Specific(IV))))
4529 return false;
4530
4531 // We want to guarantee that the uncountable exit condition (and the mask
4532 // we will generate from it) are available for all operations in the loop
4533 // that need to be masked. If the condition recipes are not already the first
4534 // recipes in the header after the last phi, move them there.
// Condition recipes already sitting at the insertion point are skipped and
// dropped from the list; the remainder are moved in reverse list order.
4535 auto InsertIt = HeaderVPBB->getFirstNonPhi();
4536 while (InsertIt != HeaderVPBB->end() &&
4537 is_contained(ConditionRecipes, &*InsertIt)) {
4538 erase(ConditionRecipes, &*InsertIt);
4539 InsertIt++;
4540 }
4541 for (auto *Recipe : reverse(ConditionRecipes))
4542 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4543
4544 // Create a mask to represent all lanes that fully execute in the vector loop,
4545 // stopping short of any early exit.
4546 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4547 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
4548 Type *IVScalarTy = IV->getScalarType();
4549 Type *FirstActiveTy = FirstActive->getScalarType();
4550 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4551 VPValue *Zero = Plan.getZero(IVScalarTy);
// Bring the first-active-lane index to the IV's scalar type.
4552 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4553 FirstActiveTy, DebugLoc());
// NOTE(review): the line defining `Mask` is missing from this listing; only
// its operand list and name are visible (presumably an active-lane-mask
// recipe -- confirm).
4555 {Zero, FirstActive, ALMMultiplier},
4556 DebugLoc(), "uncountable.exit.mask");
4557
4558 // Convert all other memory operations to use the mask.
4559 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4560 for (VPRecipeBase &R : *VPBB)
4561 if (R.mayReadOrWriteMemory() && &R != Load) {
4562 // TODO: Handle conditional memory operations in the loop.
4563 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4564 return false;
4565 cast<VPInstruction>(&R)->addMask(Mask);
4566 }
4567
4568 // Update middle block branch to compare (IV + however many lanes were active)
4569 // against the full trip count, since we may be exiting the vector loop early.
4570 // If we didn't take an early exit, we should get the equivalent of VF from
4571 // the FirstActiveLane.
4572 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->end());
// Lane 0 of the widened IV plus the first active lane gives the IV value at
// which the vector loop stopped.
4573 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4574 {Zero, IV}, DebugLoc());
4575 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4576 VPValue *FullTC =
4577 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4578 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {FullTC});
4579
4580 // Update resume phi in scalar.ph.
4581 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4582 auto Phis = ScalarPH->phis();
4583 // TODO: Handle more than one Phi; re-derive from IV.
4584 // TODO: Handle reductions.
4585 if (range_size(Phis) != 1)
4586 return false;
4587 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4588 // Make sure we're referring to the same IV.
// NOTE(review): the pattern argument of this match is on a line missing from
// the listing.
4589 assert(
4590 match(ContinueIV->getOperand(0),
4592 "Continuing from different IV");
// The scalar loop resumes from the IV value computed in the middle block.
4593 ContinueIV->setOperand(0, ExitIV);
4594 return true;
4595}
4596
// Rewrites the uncountable early exits of the loop. The visible code collects
// one exit condition per early-exiting block, orders the exits by the reverse
// post-order position of their exiting blocks, combines the conditions with
// logical ORs feeding an AnyOf, and replaces the latch terminator. Unless the
// scalar-loop path is taken, it then creates one vector.early.exit block per
// exit plus a chain of dispatch blocks selecting the exit to take, and returns
// true.
// NOTE(review): the listing lines carrying the function name, the trailing
// parameters, the declaration of `Exits`, and several statement heads are
// absent from this extract; confirm against upstream.
4598 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4599 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
// The dominator tree is only needed by the assertions below.
4601#ifndef NDEBUG
4602 VPDominatorTree VPDT(Plan);
4603#endif
4604 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
// Every predecessor of an exit block other than the middle block is treated
// as an early-exiting block.
4606 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4607 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4608 if (Pred == MiddleVPBB)
4609 continue;
4610 // Collect condition for this early exit.
4611 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4612 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4613 VPValue *CondOfEarlyExitingVPBB;
4614 [[maybe_unused]] bool Matched =
4615 match(EarlyExitingVPBB->getTerminator(),
4616 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4617 assert(Matched && "Terminator must be BranchOnCond");
4618
4619 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4620 // the correct block mask.
// The branch condition is used as-is when successor 0 is the exit block and
// negated otherwise.
// NOTE(review): the opcode argument of this createNaryOp call is on a line
// missing from the listing.
4621 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4622 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4624 TrueSucc == ExitBlock
4625 ? CondOfEarlyExitingVPBB
4626 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4627 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4628 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4629 VPDT.properlyDominates(
4630 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4631 LatchVPBB)) &&
4632 "exit condition must dominate the latch");
4633 Exits.push_back({
4634 EarlyExitingVPBB,
4635 ExitBlock,
4636 CondToEarlyExit,
4637 });
4638 }
4639 }
4640
4641 assert(!Exits.empty() && "must have at least one early exit");
4642 // Sort exits by RPO order to get correct program order. RPO gives a
4643 // topological ordering of the CFG, ensuring upstream exits are checked
4644 // before downstream exits in the dispatch chain.
// NOTE(review): the declarations of `RPOT` and `RPOIdx` are on lines missing
// from the listing; RPOIdx maps each block to its position in RPOT.
4646 HeaderVPBB);
4648 for (const auto &[Num, VPB] : enumerate(RPOT))
4649 RPOIdx[VPB] = Num;
4650 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4651 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4652 });
4653#ifndef NDEBUG
4654 // After RPO sorting, verify that for any pair where one exit dominates
4655 // another, the dominating exit comes first. This is guaranteed by RPO
4656 // (topological order) and is required for the dispatch chain correctness.
4657 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4658 for (unsigned J = I + 1; J < Exits.size(); ++J)
4659 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4660 Exits[I].EarlyExitingVPBB) &&
4661 "RPO sort must place dominating exits before dominated ones");
4662#endif
4663
4664 // Build the AnyOf condition for the latch terminator using logical OR
4665 // to avoid poison propagation from later exit conditions when an earlier
4666 // exit is taken.
4667 VPValue *Combined = Exits[0].CondToExit;
4668 for (const EarlyExitInfo &Info : drop_begin(Exits))
4669 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4670
4671 VPValue *IsAnyExitTaken =
4672 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4673
4674 // Create a comparison for the latch exit condition and replace the
4675 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4676 // is used as the latch-exit condition; canonical IV recipes have not been
4677 // introduced yet, so there is no BranchOnCount to derive the condition from.
4678 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4679 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4680 "Unexpected terminator");
// Operand and debug location are read before the old branch is erased.
4681 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4682 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4683 LatchExitingBranch->eraseFromParent();
4684 LatchBuilder.setInsertPoint(LatchVPBB);
// NOTE(review): the head of the call creating the new terminator is on a
// line missing from the listing.
4686 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4687 LatchVPBB->clearSuccessors();
4688
// NOTE(review): the condition guarding this block and the head of the call
// at its end (presumably returned to the caller) are on lines missing from
// the listing. Both exit edges of the latch lead to the middle block here.
4690 // If handling the exiting lane in the scalar loop, combine the exit
4691 // conditions into a single BranchOnCond.
4692 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4693 MiddleVPBB->clearPredecessors();
4694 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4696 Plan, Exits, HeaderVPBB, LatchVPBB, MiddleVPBB, TheLoop, PSE, DT, AC);
4697 }
4698
4699 // Create the vector.early.exit blocks.
4700 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4701 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
// NOTE(review): a Twine is stored in a local here; the LLVM Programmer's
// Manual advises using Twine only as a temporary or function argument,
// because it may reference temporaries of the initializing expression.
4702 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4703 VPBasicBlock *VectorEarlyExitVPBB =
4704 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4705 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4706 }
4707
4708 // Create the dispatch block (or reuse the single exit block if only one
4709 // exit). The dispatch block computes the first active lane of the combined
4710 // condition and, for multiple exits, chains through conditions to determine
4711 // which exit to take.
4712 VPBasicBlock *DispatchVPBB =
4713 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4714 : Plan.createVPBasicBlock("vector.early.exit.check");
4715 DispatchVPBB->setPredecessors({LatchVPBB});
4716 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4717 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4718 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4719 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4720
4721 // For each early exit, disconnect the original exiting block
4722 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4723 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4724 // values at the first active lane:
4725 //
4726 // Input:
4727 // early.exiting.I:
4728 // ...
4729 // EMIT branch-on-cond vp<%cond.I>
4730 // Successor(s): in.loop.succ, ir-bb<exit.I>
4731 //
4732 // ir-bb<exit.I>:
4733 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4734 //
4735 // Output:
4736 // early.exiting.I:
4737 // ...
4738 // Successor(s): in.loop.succ
4739 //
4740 // vector.early.exit.I:
4741 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4742 // Successor(s): ir-bb<exit.I>
4743 //
4744 // ir-bb<exit.I>:
4745 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4746 // vector.early.exit.I)
4747 //
4748 for (auto [Exit, VectorEarlyExitVPBB] :
4749 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4750 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4751 // Adjust the phi nodes in EarlyExitVPBB.
4752 // 1. remove incoming values from EarlyExitingVPBB,
4753 // 2. extract the incoming value at FirstActiveLane
4754 // 3. add back the extracts as last operands for the phis
4755 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4756 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4757 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4758 // values from VectorEarlyExitVPBB.
4759 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4760 auto *ExitIRI = cast<VPIRPhi>(&R);
4761 VPValue *IncomingVal =
4762 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4763 VPValue *NewIncoming = IncomingVal;
// Incoming values that are VPIRValues are reused without an extract.
4764 if (!isa<VPIRValue>(IncomingVal)) {
4765 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4766 NewIncoming = EarlyExitBuilder.createNaryOp(
4767 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4768 DebugLoc::getUnknown(), "early.exit.value");
4769 }
4770 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4771 ExitIRI->addIncoming(NewIncoming);
4772 }
4773
4774 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4775 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4776 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4777 }
4778
4779 // Chain through exits: for each exit, check if its condition is true at
4780 // the first active lane. If so, take that exit; otherwise, try the next.
4781 // The last exit needs no check since it must be taken if all others fail.
4782 //
4783 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4784 //
4785 // latch:
4786 // ...
4787 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4788 // ...
4789 //
4790 // vector.early.exit.check:
4791 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4792 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4793 // EMIT branch-on-cond vp<%at.cond.0>
4794 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4795 //
4796 // vector.early.exit.check.0:
4797 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4798 // EMIT branch-on-cond vp<%at.cond.1>
4799 // Successor(s): vector.early.exit.1, vector.early.exit.2
// The loop visits every exit except the last, so it does not run at all when
// there is a single exit.
4800 VPBasicBlock *CurrentBB = DispatchVPBB;
4801 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4802 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4803 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4804 DebugLoc::getUnknown(), "exit.cond.at.lane");
4805
4806 // For the last dispatch, branch directly to the last exit on false;
4807 // otherwise, create a new check block.
4808 bool IsLastDispatch = (I + 2 == Exits.size());
4809 VPBasicBlock *FalseBB =
4810 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4811 : Plan.createVPBasicBlock(
4812 Twine("vector.early.exit.check.") + Twine(I));
4813
4814 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4815 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4816 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4817 FalseBB->setPredecessors({CurrentBB});
4818
// Subsequent recipes are appended to the block reached on the false edge.
4819 CurrentBB = FalseBB;
4820 DispatchBuilder.setInsertPoint(CurrentBB);
4821 }
4822
4823 return true;
4824}
4825
4826/// This function tries to convert extended in-loop reductions to
4827/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4828/// valid. The created recipe must be decomposed to its constituent
4829/// recipes before execution. Returns nullptr when no expression is created.
4830static VPExpressionRecipe *
// NOTE(review): the listing line with the function name and leading
// parameters (`Red` and `Ctx` are used below) is absent from this extract.
4832 VFRange &Range) {
4833 Type *RedTy = Red->getScalarType();
4834 VPValue *VecOp = Red->getVecOp();
4835
4836 assert(!Red->isPartialReduction() &&
4837 "This path does not support partial reductions");
4838
4839 // Clamp the range if using extended-reduction is profitable.
// NOTE(review): the head of the call receiving the per-VF lambda, and the
// declarations of `CostKind` and `ExtRedCost`, are on lines missing from the
// listing. The per-VF decision holds when the combined extended-reduction
// cost is valid and lower than the extend plus the reduction costed
// separately.
4840 auto IsExtendedRedValidAndClampRange =
4841 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4843 [&](ElementCount VF) {
4844 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4846
4848 InstructionCost ExtCost =
4849 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4850 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4851
4852 assert(!RedTy->isFloatingPointTy() &&
4853 "getExtendedReductionCost only supports integer types");
4854 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4855 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4856 Red->getFastMathFlagsOrNone(), CostKind);
4857 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4858 },
4859 Range);
4860 };
4861
4862 VPValue *A;
4863 // Match reduce(ext).
// NOTE(review): the first line of this condition (the pattern match that
// binds A) is missing from the listing.
4865 IsExtendedRedValidAndClampRange(
4866 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4867 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4868 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4869
4870 return nullptr;
4871}
4872
4873/// This function tries to convert extended in-loop reductions to
4874/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4875/// and valid. The created VPExpressionRecipe must be decomposed to its
4876/// constituent recipes before execution. Patterns of the
4877/// VPExpressionRecipe:
4878/// reduce.add(mul(...)),
4879/// reduce.add(mul(ext(A), ext(B))),
4880/// reduce.add(ext(mul(ext(A), ext(B)))).
4881/// reduce.fadd(fmul(ext(A), ext(B)))
4882static VPExpressionRecipe *
// NOTE(review): the listing line with the function name and the `Red`
// parameter is absent from this extract.
4884 VPCostContext &Ctx, VFRange &Range) {
// Only add, sub and fadd reductions are candidates.
4885 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4886 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4887 Opcode != Instruction::FAdd)
4888 return nullptr;
4889
4890 assert(!Red->isPartialReduction() &&
4891 "This path does not support partial reductions");
4892 Type *RedTy = Red->getScalarType();
4893
4894 // Clamp the range if using multiply-accumulate-reduction is profitable.
// NOTE(review): the lambda's leading parameters (Mul, Ext0, Ext1), the head
// of the call receiving the per-VF lambda, and the declaration of `CostKind`
// are on lines missing from the listing.
4895 auto IsMulAccValidAndClampRange =
4897 VPWidenCastRecipe *OuterExt) -> bool {
4899 [&](ElementCount VF) {
// Without an inner extend the source type is the reduction type itself.
4901 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4902 InstructionCost MulAccCost;
4903
4904 // getMulAccReductionCost for in-loop reductions does not support
4905 // mixed or floating-point extends.
4906 if (Ext0 && Ext1 &&
4907 (Ext0->getOpcode() != Ext1->getOpcode() ||
4908 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4909 return false;
4910
4911 bool IsZExt =
4912 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4913 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4914 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4915 SrcVecTy, CostKind);
4916
// Baseline: the multiply, the reduction and every present extend costed
// separately.
4917 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4918 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4919 InstructionCost ExtCost = 0;
4920 if (Ext0)
4921 ExtCost += Ext0->computeCost(VF, Ctx);
4922 if (Ext1)
4923 ExtCost += Ext1->computeCost(VF, Ctx);
4924 if (OuterExt)
4925 ExtCost += OuterExt->computeCost(VF, Ctx);
4926
4927 return MulAccCost.isValid() &&
4928 MulAccCost < ExtCost + MulCost + RedCost;
4929 },
4930 Range);
4931 };
4932
4933 VPValue *VecOp = Red->getVecOp();
4934 VPRecipeBase *Sub = nullptr;
4935 VPValue *A, *B;
4936 VPValue *Tmp = nullptr;
4937
// NOTE(review): floating-point reduction types bail out here, before any
// pattern is matched, so the fadd pattern listed in the header comment is
// not produced by the code visible in this listing -- confirm.
4938 if (RedTy->isFloatingPointTy())
4939 return nullptr;
4940
4941 // Sub reductions could have a sub between the add reduction and vec op.
// When a `0 - X` is found, matching continues on X and the sub recipe is
// remembered in Sub.
4942 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4943 Sub = VecOp->getDefiningRecipe();
4944 VecOp = Tmp;
4945 }
4946
4947 // If ValB is a constant and can be safely extended, truncate it to the same
4948 // type as ExtA's operand, then extend it to the same type as ExtA. This
4949 // creates two uniform extends that can more easily be matched by the rest of
4950 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4951 // replaced with the new extend of the constant.
4952 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4953 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4954 VPWidenRecipe *Mul) {
// Applies only when A is an extend, B is not, and B is a VPIRValue.
4955 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4956 return;
4957 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4958 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4959 const APInt *Const;
// NOTE(review): the head of the second operand of this condition is on a
// line missing from the listing.
4960 if (!match(ValB, m_APInt(Const)) ||
4962 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4963 return;
4964 // The truncate ensures that the type of each extended operand is the
4965 // same, and it's been proven that the constant can be extended from
4966 // NarrowTy safely. Necessary since ExtA's extended operand would be
4967 // e.g. an i8, while the const will likely be an i32. This will be
4968 // elided by later optimisations.
4969 VPBuilder Builder(Mul);
4970 auto *Trunc =
4971 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4972 Type *WideTy = ExtA->getScalarType();
4973 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4974 Mul->setOperand(1, ExtB);
4975 };
4976
4977 // Try to match reduce.add(mul(...)).
4978 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4979 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4980 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4981 auto *Mul = cast<VPWidenRecipe>(VecOp);
4982
4983 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4984 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4985
4986 // Match reduce.add/sub(mul(ext, ext)).
4987 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4988 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4989 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4990 if (Sub)
4991 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4992 cast<VPWidenRecipe>(Sub), Red);
4993 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4994 }
4995 // TODO: Add an expression type for this variant with a negated mul
4996 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4997 return new VPExpressionRecipe(Mul, Red);
4998 }
4999 // TODO: Add an expression type for negated versions of other expression
5000 // variants.
5001 if (Sub)
5002 return nullptr;
5003
5004 // Match reduce.add(ext(mul(A, B))).
5005 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
5006 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
5007 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5008 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
5009 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
5010
5011 // reduce.add(ext(mul(ext, const)))
5012 // -> reduce.add(ext(mul(ext, ext(const))))
5013 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
5014
5015 // reduce.add(ext(mul(ext(A), ext(B))))
5016 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5017 // The inner extends must either have the same opcode as the outer extend or
5018 // be the same, in which case the multiply can never result in a negative
5019 // value and the outer extend can be folded away by doing wider
5020 // extends for the operands of the mul.
// NOTE(review): the cost check (which presumably clamps Range) is evaluated
// before Mul->hasOneUse(), so Range may be narrowed even when the rewrite is
// then rejected for having several uses -- confirm this is intended.
5021 if (Ext0 && Ext1 &&
5022 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
5023 Ext0->getOpcode() == Ext1->getOpcode() &&
5024 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// New extends go straight from the narrow operands to the outer extend's
// type and are inserted next to the extends they replace.
5025 auto *NewExt0 = new VPWidenCastRecipe(
5026 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
5027 *Ext0, *Ext0, Ext0->getDebugLoc());
5028 NewExt0->insertBefore(Ext0);
5029
// When both mul operands are the same extend, a single new extend is shared.
5030 VPWidenCastRecipe *NewExt1 = NewExt0;
5031 if (Ext0 != Ext1) {
5032 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
5033 Ext->getScalarType(), nullptr, *Ext1,
5034 *Ext1, Ext1->getDebugLoc());
5035 NewExt1->insertBefore(Ext1);
5036 }
5037 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
5038 NewMul->insertBefore(Mul);
5039 Ext->replaceAllUsesWith(NewMul);
5040 Ext->eraseFromParent();
5041 Mul->eraseFromParent();
5042 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
5043 }
5044 }
5045 return nullptr;
5046}
5047
5048/// This function tries to create abstract recipes from the reduction recipe for
5049/// following optimizations and cost estimation. A multiply-accumulate
/// expression is tried first, then an extended reduction; if neither is
/// created, the plan is left unchanged.
// NOTE(review): the listing line with the function name and the `Red`
// parameter is absent from this extract.
5051 VPCostContext &Ctx,
5052 VFRange &Range) {
5053 // Creation of VPExpressions for partial reductions is entirely handled in
5054 // transformToPartialReduction.
5055 assert(!Red->isPartialReduction() &&
5056 "This path does not support partial reductions");
5057
5058 VPExpressionRecipe *AbstractR = nullptr;
// The insertion point (just after Red) and the parent block are captured
// before the matchers run.
5059 auto IP = std::next(Red->getIterator());
5060 auto *VPBB = Red->getParent();
5061 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
5062 AbstractR = MulAcc;
5063 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
5064 AbstractR = ExtRed;
5065 // Cannot create abstract inloop reduction recipes.
5066 if (!AbstractR)
5067 return;
5068
5069 AbstractR->insertBefore(*VPBB, IP);
5070 Red->replaceAllUsesWith(AbstractR);
5071}
5072
5083
// Inserts explicit Broadcast recipes for values defined outside the vector
// loop (backedge-taken count, live-ins, values defined in the plan's entry
// block) and redirects to them every user that does not use the value as a
// scalar. Nothing is done for plans with only a scalar VF.
// NOTE(review): the listing line with the function signature is absent from
// this extract.
5085 if (Plan.hasScalarVFOnly())
5086 return;
5087
// The dominator tree is only needed by the assertion below.
5088#ifndef NDEBUG
5089 VPDominatorTree VPDT(Plan);
5090#endif
5091
5092 SmallVector<VPValue *> VPValues;
5093 if (VPValue *BTC = Plan.getBackedgeTakenCount())
5094 VPValues.push_back(BTC);
5095 append_range(VPValues, Plan.getLiveIns());
5096 for (VPRecipeBase &R : *Plan.getEntry())
5097 append_range(VPValues, R.definedValues());
5098
5099 auto *VectorPreheader = Plan.getVectorPreheader();
5100 for (VPValue *VPV : VPValues) {
// NOTE(review): the first line of this skip condition is missing from the
// listing; the visible part skips live-in IR values that are constants.
5102 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
5103 continue;
5104
5105 // Add explicit broadcast at the insert point that dominates all users.
// Default is the end of the vector preheader; if a non-scalar user lives in
// the vector preheader, the broadcast goes to the start of that block.
5106 VPBasicBlock *HoistBlock = VectorPreheader;
5107 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
5108 for (VPUser *User : VPV->users()) {
5109 if (User->usesScalars(VPV))
5110 continue;
5111 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
5112 HoistPoint = HoistBlock->begin();
5113 else
5114 assert(VPDT.dominates(VectorPreheader,
5115 cast<VPRecipeBase>(User)->getParent()) &&
5116 "All users must be in the vector preheader or dominated by it");
5117 }
5118
5119 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
5120 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// The broadcast itself and users that only need scalars keep using VPV.
5121 VPV->replaceUsesWithIf(Broadcast,
5122 [VPV, Broadcast](VPUser &U, unsigned Idx) {
5123 return Broadcast != &U && !U.usesScalars(VPV);
5124 });
5125 }
5126}
5127
5128// Collect common metadata from a group of replicate recipes by intersecting
5129// metadata from all recipes in the group. The group must be non-empty: the
// metadata of its first recipe seeds the intersection.
// NOTE(review): the listing line with the function signature is absent from
// this extract.
5131 VPIRMetadata CommonMetadata = *Recipes.front();
5132 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
5133 CommonMetadata.intersect(*Recipe);
5134 return CommonMetadata;
5135}
5136
// Collects groups of predicated replicate recipes (loads or stores, selected
// by Opcode) that access the same address and among which at least one pair
// of complementary masks was found. Returns the list of such groups.
// NOTE(review): the listing lines with the function name, return type and
// leading parameters, and the declarations of `AllGroups`, `Groups` and
// `Group`, are absent from this extract.
5137template <unsigned Opcode>
5141 const Loop *L) {
5142 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
5143 "Only Load and Store opcodes supported");
5144 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
5145
5146 // For each address, collect operations with the same or complementary masks.
// Only predicated recipes are considered (see the filter lambda).
5149 Plan, PSE, L,
5150 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
5151 for (auto Recipes : Groups) {
5152 if (Recipes.size() < 2)
5153 continue;
5154
5156 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
5157 "Expected all recipes in group to have the same load-store type");
5158
5159 // Collect groups with the same or complementary masks.
// Entries of Recipes are nulled out once they have been placed in a group.
5160 for (VPReplicateRecipe *&RecipeI : Recipes) {
5161 if (!RecipeI)
5162 continue;
5163
5164 VPValue *MaskI = RecipeI->getMask();
5166 Group.push_back(RecipeI);
5167 RecipeI = nullptr;
5168
5169 // Find all operations with the same or complementary masks.
// NOTE(review): as written, every remaining recipe is appended to the group
// regardless of its mask; the flag only records whether some mask was the
// complement of MaskI. Confirm this matches the intent stated above.
5170 bool HasComplementaryMask = false;
5171 for (VPReplicateRecipe *&RecipeJ : Recipes) {
5172 if (!RecipeJ)
5173 continue;
5174
5175 VPValue *MaskJ = RecipeJ->getMask();
5176 // Check if any operation in the group has a complementary mask with
5177 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
5178 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
5179 match(MaskJ, m_Not(m_Specific(MaskI)));
5180 Group.push_back(RecipeJ);
5181 RecipeJ = nullptr;
5182 }
5183
// Groups without any complementary pair are discarded.
5184 if (HasComplementaryMask) {
5185 assert(Group.size() >= 2 && "must have at least 2 entries");
5186 AllGroups.push_back(std::move(Group));
5187 }
5188 }
5189 }
5190
5191 return AllGroups;
5192}
5193
5194// Find the recipe with minimum alignment in the group. InstType is the IR
// instruction class of the recipes' underlying instructions (callers in this
// file use LoadInst and StoreInst). The group must be non-empty, since the
// result of min_element is dereferenced.
5195template <typename InstType>
5196static VPReplicateRecipe *
// NOTE(review): the listing line with the function name and its `Group`
// parameter is absent from this extract.
5198 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
5199 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
5200 cast<InstType>(B->getUnderlyingInstr())->getAlign();
5201 });
5202}
5203
// For each group of predicated loads returned by the grouping helper,
// replaces all loads of the group with a single unpredicated load placed
// before the first load of the group, provided the alias check between the
// first and last load's blocks succeeds.
// NOTE(review): the listing lines with the function name, the leading
// parameters and the call that computes `Groups` are absent from this
// extract.
5206 const Loop *L) {
5207 auto Groups =
5209 if (Groups.empty())
5210 return;
5211
5212 // Process each group of loads.
5213 for (auto &Group : Groups) {
5214 // Try to use the earliest (most dominating) load to replace all others.
5215 VPReplicateRecipe *EarliestLoad = Group[0];
5216 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5217 VPBasicBlock *LastBB = Group.back()->getParent();
5218
5219 // Check that the load doesn't alias with stores between first and last.
// Groups without a known memory location are skipped as well.
5220 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
5221 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
5222 continue;
5223
5224 // Collect common metadata from all loads in the group.
5225 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5226
5227 // Find the load with minimum alignment to use.
5228 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5229
5230 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5231 assert(all_of(Group,
5232 [IsSingleScalar](VPReplicateRecipe *R) {
5233 return R->isSingleScalar() == IsSingleScalar;
5234 }) &&
5235 "all members in group must agree on IsSingleScalar");
5236
5237 // Create an unpredicated version of the earliest load with common
5238 // metadata.
// The underlying instruction comes from the least-aligned load, while the
// address operand comes from the earliest load.
5239 auto *UnpredicatedLoad = new VPReplicateRecipe(
5240 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5241 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5242
5243 UnpredicatedLoad->insertBefore(EarliestLoad);
5244
5245 // Replace all loads in the group with the unpredicated load.
5246 for (VPReplicateRecipe *Load : Group) {
5247 Load->replaceAllUsesWith(UnpredicatedLoad);
5248 Load->eraseFromParent();
5249 }
5250 }
5251}
5252
// Returns true if the group of stores can be sunk according to
// canHoistOrSinkWithNoAliasCheck over the blocks from the first to the last
// store of the group. Returns false when the first store has no known memory
// location or its alias tags carry no scope.
5253static bool
// NOTE(review): the listing line with the function name and the
// `StoresToSink` parameter is absent from this extract.
5255 PredicatedScalarEvolution &PSE, const Loop &L) {
5256 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5257 if (!StoreLoc || !StoreLoc->AATags.Scope)
5258 return false;
5259
5260 // When sinking a group of stores, all members of the group alias each other.
5261 // Skip them during the alias checks.
5262 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5263 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5264 SinkStoreInfo SinkInfo(StoresToSink, *StoresToSink[0], PSE, L);
5265 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5266}
5267
// For each group of predicated stores returned by the grouping helper that
// passes canSinkStoreWithNoAliasCheck, replaces the group with one
// unpredicated store at the position of the group's last store; the stored
// value is chosen by a chain of selects over the group's masks.
// NOTE(review): the listing lines with the function name, the leading
// parameters and the call that computes `Groups` are absent from this
// extract.
5270 const Loop *L) {
5271 auto Groups =
5273 if (Groups.empty())
5274 return;
5275
5276 for (auto &Group : Groups) {
5277 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5278 continue;
5279
5280 // Use the last (most dominated) store's location for the unconditional
5281 // store.
5282 VPReplicateRecipe *LastStore = Group.back();
5283 VPBasicBlock *InsertBB = LastStore->getParent();
5284
5285 // Collect common alias metadata from all stores in the group.
5286 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5287
5288 // Build select chain for stored values.
// The chain starts from the first store's value; each later store's value
// takes over where that store's mask is set.
5289 VPValue *SelectedValue = Group[0]->getOperand(0);
5290 VPBuilder Builder(InsertBB, LastStore->getIterator());
5291
5292 bool IsSingleScalar = Group[0]->isSingleScalar();
5293 for (unsigned I = 1; I < Group.size(); ++I) {
5294 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5295 "all members in group must agree on IsSingleScalar");
5296 VPValue *Mask = Group[I]->getMask();
5297 VPValue *Value = Group[I]->getOperand(0);
5298 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5299 Group[I]->getDebugLoc());
5300 }
5301
5302 // Find the store with minimum alignment to use.
5303 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5304
5305 // Create unconditional store with selected value and common metadata.
// The address operand (operand 1) is taken from the last store.
5306 auto *UnpredicatedStore = new VPReplicateRecipe(
5307 StoreWithMinAlign->getUnderlyingInstr(),
5308 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5309 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5310 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5311
5312 // Remove all predicated stores from the group.
5313 for (VPReplicateRecipe *Store : Group)
5314 Store->eraseFromParent();
5315 }
5316}
5317
5319 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5321 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5322 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5323
// When the plan's trip count is a VPIRValue whose SCEV is a constant, compute
// (TC / (BestVF * BestUF)) * (BestVF * BestUF) with ScalarEvolution and, if the
// result folds to a constant, record it as the underlying value of the plan's
// vector trip count. Nothing is done when the trip count has no users.
// NOTE(review): the function name, the remainder of the signature (which
// presumably declares PSE) and one bail-out condition are on lines missing
// from this listing.
5324 VPValue *TC = Plan.getTripCount();
5325 if (TC->getNumUsers() == 0)
5326 return;
5327
5328 // Skip cases for which the trip count may be non-trivial to materialize.
5329 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5330 // tail is required.
5331 if (!Plan.hasScalarTail() ||
5333 Plan.getScalarPreheader() ||
5334 !isa<VPIRValue>(TC))
5335 return;
5336
5337 // Materialize vector trip counts for constants early if it can simply
5338 // be computed as (Original TC / VF * UF) * VF * UF.
5339 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5340 // tail-folded loops.
5341 ScalarEvolution &SE = *PSE.getSE();
5342 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5343 if (!isa<SCEVConstant>(TCScev))
5344 return;
5345 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5346 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
// The product may not fold to a constant; in that case the vector trip count
// is left untouched.
5347 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5348 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5349 }
5350
5352 VPBasicBlock *VectorPH) {
// Replaces every use of BTC by "trip count - 1", emitted as a subtraction
// named "trip.count.minus.1" at the beginning of VectorPH. Does nothing when
// BTC has no users.
// NOTE(review): the function name and the definition of BTC are on lines
// missing from this listing; BTC is presumably the plan's backedge-taken
// count - confirm.
5354 if (BTC->getNumUsers() == 0)
5355 return;
5356
5357 VPBuilder Builder(VectorPH, VectorPH->begin());
5358 auto *TCTy = Plan.getTripCount()->getScalarType();
5359 auto *TCMO =
5360 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5361 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5362 BTC->replaceAllUsesWith(TCMO);
5363 }
5364
// Makes conversions between per-lane scalar values and vector values explicit.
// The first loop nest inserts a new VPInstruction (a Build(Struct)Vector, per
// the comment below) after qualifying single-def recipes and redirects their
// vector users, and users outside the loop region, to it. The second loop nest
// inserts VPInstruction::Unpack for values defined inside the loop region that
// have scalar users outside replicate regions. Plans with only a scalar VF are
// left unchanged.
// NOTE(review): the signature and several filter conditions are on lines
// missing from this listing; the comments here cover only what is visible.
5366 if (Plan.hasScalarVFOnly())
5367 return;
5368
5369 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5370 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5372 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5373 vp_depth_first_shallow(LoopRegion->getEntry()));
5374 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5375 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5376 // regions. Those are not materialized explicitly yet.
5377 // TODO: materialize build vectors for replicating recipes in replicating
5378 // regions.
5379 for (VPBasicBlock *VPBB :
5380 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5381 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5383 continue;
5384 auto *DefR = cast<VPSingleDefRecipe>(&R);
// True for users that either do not use DefR's scalars or whose region differs
// from LoopRegion.
5385 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5386 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5387 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5388 };
// Skip single-scalar replicate recipes, the VPInstructions excluded by the
// (partly missing) condition below, and recipes without any qualifying user.
5389 if ((isa<VPReplicateRecipe>(DefR) &&
5390 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5391 (isa<VPInstruction>(DefR) &&
5393 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5394 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5395 continue;
5396
5397 Type *ScalarTy = DefR->getScalarType();
5398 unsigned Opcode = ScalarTy->isStructTy()
5401 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5402 BuildVector->insertAfter(DefR);
5403
// BuildVector itself must keep using DefR, so it is excluded from the
// replacement.
5404 DefR->replaceUsesWithIf(
5405 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5406 VPUser &U, unsigned) {
5407 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5408 });
5409 }
5410 }
5411
5412 // Create explicit VPInstructions to convert vectors to scalars. The current
5413 // implementation is conservative - it may miss some cases that may or may not
5414 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5415 // if they are known to operate on scalar values.
5416 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5417 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5419 VPDerivedIVRecipe>(&R))
5420 continue;
5421 for (VPValue *Def : R.definedValues()) {
5422 // Skip recipes that are single-scalar or only have their first lane
5423 // used.
5424 // TODO: The Defs skipped here may or may not be vector values.
5425 // Introduce Unpacks, and remove them later, if they are guaranteed to
5426 // produce scalar values.
5428 continue;
5429
5430 // At the moment, we create unpacks only for scalar users outside
5431 // replicate regions. Recipes inside replicate regions still extract the
5432 // required lanes implicitly.
5433 // TODO: Remove once replicate regions are unrolled completely.
5434 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5435 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5436 return U->usesScalars(Def) &&
5437 (!ParentRegion || !ParentRegion->isReplicator());
5438 };
5439 if (none_of(Def->users(), IsCandidateUnpackUser))
5440 continue;
5441
5442 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
// For phi recipes the unpack is placed at the first non-phi position of the
// block rather than directly after the phi.
5443 if (R.isPhi())
5444 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5445 else
5446 Unpack->insertAfter(&R);
5447 Def->replaceUsesWithIf(Unpack,
5448 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5449 return IsCandidateUnpackUser(&U);
5450 });
5451 }
5452 }
5453 }
5454 }
5455
5457 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5458 bool RequiresScalarEpilogue, VPValue *Step,
5459 std::optional<uint64_t> MaxRuntimeStep) {
// Replaces all uses of the plan's symbolic vector trip count with an explicit
// computation built through a VPBuilder on VectorPHVPBB.
//   TailByMasking          - round the trip count up by Step - 1 before the
//                            remainder is subtracted.
//   RequiresScalarEpilogue - when the remainder is zero, use Step as the
//                            remainder instead.
//   Step                   - divisor used for the remainder.
//   MaxRuntimeStep         - if set and a constant trip count is a multiple
//                            of it (and no scalar epilogue is required), the
//                            trip count itself is used as vector trip count.
// NOTE(review): the line holding the function name is missing from this
// listing; the name is taken from the call materializeVectorTripCount(Plan,
// VectorPH, /*TailByMasking=*/false, RequiresScalarEpilogue, Step) elsewhere
// in this file - confirm.
5460 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5461 // There's nothing to do if there are no users of the vector trip count or its
5462 // IR value has already been set.
5463 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5464 return;
5465
5466 VPValue *TC = Plan.getTripCount();
5467 Type *TCTy = TC->getScalarType();
5468 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5469 if (auto *StepR = Step->getDefiningRecipe()) {
5470 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5471 "Step VPBB must dominate VectorPHVPBB");
5472 // Insert after Step's definition to maintain valid def-use ordering.
5473 InsertPt = std::next(StepR->getIterator());
5474 }
5475 VPBuilder Builder(VectorPHVPBB, InsertPt);
5476
5477 // For scalable steps, if TC is a constant and is divisible by the maximum
5478 // possible runtime step, then TC % Step == 0 for all valid vscale values
5479 // and the vector trip count equals TC directly.
5480 const APInt *TCVal;
5481 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5482 TCVal->urem(*MaxRuntimeStep) == 0) {
5483 VectorTC.replaceAllUsesWith(TC);
5484 return;
5485 }
5486
5487 // If the tail is to be folded by masking, round the number of iterations N
5488 // up to a multiple of Step instead of rounding down. This is done by first
5489 // adding Step-1 and then rounding down. Note that it's ok if this addition
5490 // overflows: the vector induction variable will eventually wrap to zero given
5491 // that it starts at zero and its Step is a power of two; the loop will then
5492 // exit, with the last early-exit vector comparison also producing all-true.
5493 if (TailByMasking) {
5494 TC = Builder.createAdd(
5495 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5496 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5497 }
5498
5499 // Now we need to generate the expression for the part of the loop that the
5500 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5501 // iterations are not required for correctness, or N - Step, otherwise. Step
5502 // is equal to the vectorization factor (number of SIMD elements) times the
5503 // unroll factor (number of SIMD instructions).
5504 VPValue *R =
5505 Builder.createNaryOp(Instruction::URem, {TC, Step},
5506 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5507
5508 // There are cases where we *must* run at least one iteration in the remainder
5509 // loop. See the cost model for when this can happen. If the step evenly
5510 // divides the trip count, we set the remainder to be equal to the step. If
5511 // the step does not evenly divide the trip count, no adjustment is necessary
5512 // since there will already be scalar iterations. Note that the minimum
5513 // iterations check ensures that N >= Step.
5514 if (RequiresScalarEpilogue) {
// NOTE(review): "fail folding" in the message below looks like a typo for
// "tail folding".
5515 assert(!TailByMasking &&
5516 "requiring scalar epilogue is not supported with fail folding");
5517 VPValue *IsZero =
5518 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5519 R = Builder.createSelect(IsZero, Step, R);
5520 }
5521
// n.vec = (possibly rounded-up) trip count minus the remainder.
5522 VPValue *Res =
5523 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5524 VectorTC.replaceAllUsesWith(Res);
5525 }
5526
5528 ElementCount VFEC) {
// Replaces the plan's symbolic VF and VFxUF values with values created at the
// beginning of VectorPH from VFEC and the plan's concrete UF, using the trip
// count's scalar type.
// NOTE(review): the function name, the leading signature line and two lines
// around the Broadcast creation are missing from this listing.
5529 // If VF and VFxUF have already been materialized (no remaining users),
5530 // there's nothing more to do.
5531 if (Plan.getVF().isMaterialized()) {
5532 assert(Plan.getVFxUF().isMaterialized() &&
5533 "VF and VFxUF must be materialized together");
5534 return;
5535 }
5536
5537 VPBuilder Builder(VectorPH, VectorPH->begin());
5538 Type *TCTy = Plan.getTripCount()->getScalarType();
5539 VPValue &VF = Plan.getVF();
5540 VPValue &VFxUF = Plan.getVFxUF();
5541 // If there are no users of the runtime VF, compute VFxUF by constant folding
5542 // the multiplication of VF and UF.
5543 if (VF.getNumUsers() == 0) {
5544 VPValue *RuntimeVFxUF =
5545 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5546 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5547 return;
5548 }
5549
5550 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5551 // vscale) * UF.
5552 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Users that do not use VF's scalars are given a broadcast of RuntimeVF; the
// guarding condition is on a line missing from this listing.
5554 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5556 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5557 }
5558 VF.replaceAllUsesWith(RuntimeVF);
5559
// NOTE(review): {true, false} are the wrap flags handed to
// createOverflowingOp, presumably {NUW, NSW} - confirm against VPBuilder.
5560 VPValue *MulByUF = Builder.createOverflowingOp(
5561 Instruction::Mul,
5562 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5563 {true, false});
5564 VFxUF.replaceAllUsesWith(MulByUF);
5565 }
5566
// Creates a VPInstruction::IncomingAliasMask named "incoming.alias.mask" of
// type i1 through a builder on the vector preheader, ANDs it with the plan's
// header mask, and redirects every other user of the header mask to the
// combined mask.
// NOTE(review): the signature is on a line missing from this listing, so the
// function's actual name is not visible here.
5568 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5569 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5570 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5571
5572 VPBuilder Builder(Plan.getVectorPreheader());
5573 auto *AliasMask = Builder.createNaryOp(
5574 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5575 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5576
// Place the AND after the header mask's definition; if that definition is a
// phi, place it at the first non-phi position of its block instead.
5577 if (HeaderMaskDef->isPhi())
5578 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5579 else
5580 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5581
5582 // Update all existing users of the header mask to "HeaderMask & AliasMask".
// The AND itself is excluded so that it keeps using the original header mask.
5583 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
5584 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5585 return &U != ClampedHeaderMask;
5586 });
5587 }
5588
// Builds, in AliasCheckVPBB, one loop_dependence_war_mask intrinsic per entry
// of DiffChecks and ANDs them together. The plan's incoming alias mask is then
// replaced by that combined mask. Returns the number of active lanes of the
// combined mask, converted from the index type to IVTy.
// NOTE(review): the line with the function name, the definitions of Src, Sink
// and IVTy are missing from this listing; the name is taken from the call
// materializeAliasMask(Plan, ClampedVFCheck, DiffChecks) elsewhere in this
// file.
5589 VPValue *
5591 ArrayRef<PointerDiffInfo> DiffChecks) {
5592 VPBuilder Builder(AliasCheckVPBB);
5593 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5594
5595 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5596 assert(IncomingAliasMask && "Expected an alias mask!");
5597
// NOTE(review): AliasMask stays null if DiffChecks is empty and is then used
// below; callers presumably always pass at least one check - confirm.
5598 VPValue *AliasMask = nullptr;
5599 for (const PointerDiffInfo &Check : DiffChecks) {
5601 VPValue *Sink =
5603 Type *AddrType = Src->getScalarType();
5604
5605 // TODO: Only freeze the required pointer (not both src and sink).
5606 if (Check.NeedsFreeze) {
5607 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5608 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5609 }
5610
5611 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5612 // dependency between the source and the sink. This is not necessary for
5613 // correctness of the mask, but using the "raw" variant prevents loads
5614 // depending on the completion of stores.
5615 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5616 Intrinsic::loop_dependence_war_mask,
5617 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5618
// Accumulate the conjunction of all per-check masks.
5619 if (AliasMask)
5620 AliasMask = Builder.createAnd(AliasMask, WARMask);
5621 else
5622 AliasMask = WARMask;
5623 }
5624
// The lane count is produced in the data layout's index type for address
// space 0 and then zero-extended or truncated to IVTy.
5626 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5627 VPValue *NumActive = Builder.createNaryOp(
5628 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5629 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5630 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5631 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5632
5633 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5634
5635 return ClampedVF;
5636 }
5637
5639 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
// Creates the block "vector.clamped.vf.check", materializes the alias mask in
// it, and attaches it via attachVPCheckBlock with the condition
// (ClampedVF <= 1) || ((MaxUIntTripCount - TripCount) < ClampedVF). Afterwards
// all uses of the plan's VF and VFxUF are replaced by ClampedVF.
// NOTE(review): the function name, the definitions of DL and MaxUIntTripCount
// and the head of the trip-count materialization call are on lines missing
// from this listing.
5640 VPBasicBlock *ClampedVFCheck =
5641 Plan.createVPBasicBlock("vector.clamped.vf.check");
5642
5643 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5644 VPBuilder Builder(ClampedVFCheck);
5646 Type *TCTy = Plan.getTripCount()->getScalarType();
5647
5648 // Check the "ClampedVF" from the alias mask is larger than one.
5649 VPValue *IsScalar =
5650 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5651 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5652
5653 VPValue *TripCount = Plan.getTripCount();
5654 VPValue *MaxUIntTripCount =
5656 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5657
5658 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5659 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5660 // condition (index.next == n.vec) may not be correct in the case of an
5661 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5662 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5663 // power-of-two.
5664 VPValue *TripCountCheck = Builder.createICmp(
5665 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5666
5667 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5668 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5669
5670 // Materialize the trip count early as this will add a use of (VFxUF) that
5671 // needs to be replaced with the ClampedVF.
5673 /*TailByMasking=*/true,
5674 /*RequiresScalarEpilogue=*/false,
5675 &Plan.getVFxUF());
5676
// With UF == 1, VF and VFxUF are the same quantity, so both are replaced by
// ClampedVF.
5677 assert(Plan.getConcreteUF() == 1 &&
5678 "Clamped VF not supported with interleaving");
5679 Plan.getVF().replaceAllUsesWith(ClampedVF);
5680 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5681 }
5682
5684 ScalarEvolution &SE) {
// Walks the recipes of the plan's entry block and replaces each
// VPExpandSCEVRecipe that has users with the value returned by
// VPSCEVExpander::tryToExpand, erasing the recipe afterwards. Recipes without
// users, and recipes for which tryToExpand returns null, are left in place.
// NOTE(review): the function name and the head of the DL initializer are on
// lines missing from this listing.
5685 auto *Entry = Plan.getEntry();
5686 VPBuilder Builder(Entry, Entry->begin());
5688 ->getIRBasicBlock()
5689 ->getTerminator()
5690 ->getDebugLoc();
5691 VPSCEVExpander Expander(Builder, SE, DL);
5692
5693 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5694 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5695 // late expansion.
5696 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5697 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5698 if (!ExpSCEV || ExpSCEV->getNumUsers() == 0)
5699 continue;
// New recipes are inserted at the position of the recipe being expanded.
5700 Builder.setInsertPoint(ExpSCEV);
5701 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
5702 if (!Expanded)
5703 continue;
5704 ExpSCEV->replaceAllUsesWith(Expanded);
5705 // TripCount should not be used after expansion to VPInstructions. Reset to
5706 // poison to avoid dangling references.
5707 if (Plan.getTripCount() == ExpSCEV)
5708 Plan.resetTripCount(Plan.getPoison(ExpSCEV->getScalarType()));
5709 ExpSCEV->eraseFromParent();
5710 }
5711 }
5712
// Expands every VPExpandSCEVRecipe in the plan's entry VPIRBasicBlock to IR
// with SCEVExpander, inserting before the terminator of the underlying IR
// block. Each recipe is replaced by a live-in for the expanded value and
// erased. Returns a map from each expanded SCEV to its IR value.
// NOTE(review): the signature, the head of an assert and the body statement
// of the final loop are on lines missing from this listing.
5715 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5716
5717 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5718 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5719 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5720 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5721 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5722 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5723 if (!ExpSCEV)
5724 continue;
5725 const SCEV *Expr = ExpSCEV->getSCEV();
5726 Value *Res =
5727 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5728 ExpandedSCEVs[Expr] = Res;
5729 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5730 ExpSCEV->replaceAllUsesWith(Exp);
// Keep the plan's trip count pointing at a live value once the recipe is gone.
5731 if (Plan.getTripCount() == ExpSCEV)
5732 Plan.resetTripCount(Exp);
5733 ExpSCEV->eraseFromParent();
5734 }
5736 "all VPExpandSCEVRecipes must have been expanded");
5737 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5738 // to the VPIRBasicBlock.
// Walk all IR instructions except the last one (drop_end) in lock-step with
// the recipes; an instruction already wrapped by the VPIRInstruction at EI is
// skipped.
5739 auto EI = Entry->begin();
5740 for (Instruction &I : drop_end(*EntryBB)) {
5741 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5742 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5743 EI++;
5744 continue;
5745 }
5747 }
5748
5749 return ExpandedSCEVs;
5750 }
5751
5752/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5753/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5754/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5755/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5756/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5757/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5758/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5759/// is defined at \p Idx of a load interleave group.
5760static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5761 VPValue *OpV, unsigned Idx, bool IsScalable) {
5762 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5763 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5764 if (!Member0OpR) {
5765 // Member0's operand is a uniform live-in, broadcast across all fields.
5766 if (Member0Op == OpV)
5767 return true;
5768 // Otherwise distinct per-field live-ins are assembled into a BuildVector.
5769 return !IsScalable && !OpV->hasDefiningRecipe() &&
5770 OpV->getScalarType() == Member0Op->getScalarType();
5771 }
5772 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5773 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5774 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5775 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5776 Member0Op == OpV;
5777 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5778 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5779 return false;
5780}
5781
// Returns true if the values in Ops can be narrowed together: every value must
// be defined by a VPRecipeWithIRFlags whose opcode/intrinsic ID, scalar type
// and (if present) predicate match those of Ops[0], and for every operand
// position the per-member operands must either recursively satisfy
// canNarrowOps or each pass canNarrowLoad.
// NOTE(review): three lines of this function (including one per-value filter
// and the declaration of OpsI) are missing from this listing.
5782 static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5784 auto *WideMember0 = dyn_cast<VPRecipeWithIRFlags>(Ops[0]);
5785 if (!WideMember0)
5786 return false;
5787 for (VPValue *V : Ops) {
5789 return false;
5790 auto *R = cast<VPRecipeWithIRFlags>(V);
5791 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5792 return false;
5793 if (R->getScalarType() != WideMember0->getScalarType())
5794 return false;
5795 if (R->hasPredicate() && R->getPredicate() != WideMember0->getPredicate())
5796 return false;
5797 }
5798
5799 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
// Gather operand Idx of every member's defining recipe.
5801 for (VPValue *Op : Ops)
5802 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5803
5804 if (canNarrowOps(OpsI, IsScalable))
5805 continue;
5806
// Otherwise every gathered operand must pass canNarrowLoad; the enumerate
// index is the member index handed to canNarrowLoad as its Idx argument.
5807 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5808 const auto &[OpIdx, OpV] = P;
5809 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5810 }))
5811 return false;
5812 }
5813
5814 return true;
5815 }
5816
5817 /// Returns VF from \p VFs if \p InterleaveR is a full interleave group with
5818 /// factor and number of members both equal to VF. The interleave group must
5819 /// also access the full vector width. Returns std::nullopt if \p InterleaveR
/// is null or masked, if its members do not all share one scalar type, or if
/// no VF in \p VFs qualifies.
/// NOTE(review): the line with the function name and leading parameters and
/// part of the register-width query are missing from this listing; the name
/// is taken from the call isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI)
/// elsewhere in this file.
5820 static std::optional<ElementCount>
5823 const TargetTransformInfo &TTI) {
5824 if (!InterleaveR || InterleaveR->getMask())
5825 return std::nullopt;
5826
// Determine the common element type: from the defined values when there are
// no stored values, otherwise from the stored values.
5827 Type *GroupElementTy = nullptr;
5828 if (InterleaveR->getStoredValues().empty()) {
5829 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5830 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5831 return Op->getScalarType() == GroupElementTy;
5832 }))
5833 return std::nullopt;
5834 } else {
5835 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5836 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5837 return Op->getScalarType() == GroupElementTy;
5838 }))
5839 return std::nullopt;
5840 }
5841
// The group must have as many members as its factor.
5842 auto IG = InterleaveR->getInterleaveGroup();
5843 if (IG->getFactor() != IG->getNumMembers())
5844 return std::nullopt;
5845
5846 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5847 TypeSize Size = TTI.getRegisterBitWidth(
5850 assert(Size.isScalable() == VF.isScalable() &&
5851 "if Size is scalable, VF must be scalable and vice versa");
5852 return Size.getKnownMinValue();
5853 };
5854
// Return the first VF whose known minimum lane count equals the group factor
// and for which element size * lane count equals the register bit width.
5855 for (ElementCount VF : VFs) {
5856 unsigned MinVal = VF.getKnownMinValue();
5857 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5858 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5859 return {VF};
5860 }
5861 return std::nullopt;
5862 }
5863
5864/// Returns true if \p VPValue is a narrow VPValue.
5865static bool isAlreadyNarrow(VPValue *VPV) {
5866 if (isa<VPIRValue>(VPV))
5867 return true;
5868 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5869 return RepR && RepR->isSingleScalar();
5870}
5871
5872 // Convert the wide recipes defining the VPValues in \p Members feeding an
5873 // interleave group to a single narrow variant. The first member is reused as
5874 // the narrowed recipe. BuildVectors for live-in operands are inserted into \p
5875 // Preheader.
// Returns the narrowed value. Newly created or accepted narrow values are
// recorded in \p NarrowedOps; a first member that is already in that set is
// returned unchanged.
// NOTE(review): the line with the function name and first parameter, the
// condition opening the wide-recipe branch and the declaration of OpsI are
// missing from this listing; the name is taken from the recursive call below.
5877 SmallPtrSetImpl<VPValue *> &NarrowedOps,
5878 VPBasicBlock *Preheader) {
5879 VPValue *V = Members.front();
5880 auto *R = V->getDefiningRecipe();
5881 if (NarrowedOps.contains(V))
5882 return V;
5883
// Live-in members: combine all members into one BuildVector appended to the
// preheader.
5884 if (!R) {
5885 assert(all_of(Members,
5886 [V](VPValue *M) {
5887 return !M->hasDefiningRecipe() &&
5888 M->getScalarType() == V->getScalarType();
5889 }) &&
5890 "expected distinct live-ins of matching scalar type");
5891 auto *BV = new VPInstruction(VPInstruction::BuildVector, Members);
5892 Preheader->appendRecipe(BV);
5893 NarrowedOps.insert(BV);
5894 return BV;
5895 }
5896
5897 if (isAlreadyNarrow(V))
5898 return V;
5899
// Wide recipe: keep member 0, intersect its IR flags with those of the other
// members, and recursively narrow each operand position.
5901 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
5902 for (VPValue *Member : Members.drop_front())
5903 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
5904 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
5906 for (VPValue *Member : Members)
5907 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5908 WideMember0->setOperand(
5909 Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps, Preheader));
5910 }
5911 return V;
5912 }
5913
5914 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5915 // Narrow interleave group to wide load, as transformed VPlan will only
5916 // process one original iteration.
5917 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5918 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5919 LoadGroup->getMask(), /*Consecutive=*/true,
5920 *LoadGroup, LoadGroup->getDebugLoc());
5921 L->insertBefore(LoadGroup);
5922 NarrowedOps.insert(L);
5923 return L;
5924 }
5925
5926 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5927 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5928 "must be a single scalar load");
5929 NarrowedOps.insert(RepR);
5930 return RepR;
5931 }
5932
// Remaining case: a wide load. If its address is a VPVectorPointerRecipe, use
// that recipe's operand 0 as the address instead.
5933 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5934 VPValue *PtrOp = WideLoad->getAddr();
5935 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5936 PtrOp = VecPtr->getOperand(0);
5937 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5938 // process one original iteration.
5939 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5940 /*IsUniform*/ true,
5941 /*Mask*/ nullptr, {}, *WideLoad);
5942 N->insertBefore(WideLoad);
5943 NarrowedOps.insert(N);
5944 return N;
5945 }
5946
// Tries to narrow all store interleave groups of a single-block vector loop
// for one VF of Plan, so that the transformed plan processes one original
// iteration per vector iteration. On success Plan is modified in place and
// restricted to that VF; if Plan had more than one VF, a duplicate carrying
// the remaining VFs is returned. Returns nullptr when the transform does not
// apply, and also when it was applied to a plan with a single VF, so the
// return value alone does not tell whether Plan was changed.
// NOTE(review): the line with the function name and leading parameters and
// several other lines (declarations of StoreGroups, VFs and CanIVInc, parts of
// asserts and a skip condition) are missing from this listing.
5947 std::unique_ptr<VPlan>
5949 const TargetTransformInfo &TTI) {
5950 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5951
5952 if (!VectorLoop)
5953 return nullptr;
5954
5955 // Only handle single-block loops for now.
5956 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5957 return nullptr;
5958
5959 // Skip plans when we may not be able to properly narrow.
5960 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5961 if (!match(&Exiting->back(), m_BranchOnCount()))
5962 return nullptr;
5963
5964 assert(match(&Exiting->back(),
5966 m_Specific(&Plan.getVectorTripCount()))) &&
5967 "unexpected branch-on-count");
5968
5970 std::optional<ElementCount> VFToOptimize;
5971 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5974 continue;
5975
5976 // Bail out on recipes not supported at the moment:
5977 // * phi recipes other than the canonical induction
5978 // * recipes writing to memory except interleave groups
5979 // Only support plans with a canonical induction phi.
5980 if (R.isPhi())
5981 return nullptr;
5982
5983 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5984 if (R.mayWriteToMemory() && !InterleaveR)
5985 return nullptr;
5986
5987 // Bail out if any recipe defines a vector value used outside the
5988 // vector loop region.
5989 if (any_of(R.definedValues(), [&](VPValue *V) {
5990 return any_of(V->users(), [&](VPUser *U) {
5991 auto *UR = cast<VPRecipeBase>(U);
5992 return UR->getParent()->getParent() != VectorLoop;
5993 });
5994 }))
5995 return nullptr;
5996
5997 // All other ops are allowed, but we reject uses that cannot be converted
5998 // when checking all allowed consumers (store interleave groups) below.
5999 if (!InterleaveR)
6000 continue;
6001
6002 // Try to find a single VF, where all interleave groups are consecutive and
6003 // saturate the full vector width. If we already have a candidate VF, check
6004 // if it is applicable for the current InterleaveR, otherwise look for a
6005 // suitable VF across the Plan's VFs.
6007 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
6008 : to_vector(Plan.vectorFactors());
6009 std::optional<ElementCount> NarrowedVF =
6010 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
6011 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
6012 return nullptr;
6013 VFToOptimize = NarrowedVF;
6014
6015 // Skip read interleave groups.
6016 if (InterleaveR->getStoredValues().empty())
6017 continue;
6018
6019 // Narrow interleave groups, if all operands are already matching narrow
6020 // ops.
6021 auto *Member0 = InterleaveR->getStoredValues()[0];
6022 if (isAlreadyNarrow(Member0) &&
6023 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
6024 StoreGroups.push_back(InterleaveR);
6025 continue;
6026 }
6027
6028 // For now, we only support full interleave groups storing load interleave
6029 // groups.
6030 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
6031 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
6032 if (!DefR)
6033 return false;
6034 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
6035 return IR && IR->getInterleaveGroup()->isFull() &&
6036 IR->getVPValue(Op.index()) == Op.value();
6037 })) {
6038 StoreGroups.push_back(InterleaveR);
6039 continue;
6040 }
6041
6042 // Check if all values feeding InterleaveR are matching wide recipes with
6043 // operands that can be narrowed.
6044 if (!canNarrowOps(InterleaveR->getStoredValues(),
6045 VFToOptimize->isScalable()))
6046 return nullptr;
6047 StoreGroups.push_back(InterleaveR);
6048 }
6049
6050 if (StoreGroups.empty())
6051 return nullptr;
6052
// A scalar epilogue is required when the middle block's only successor is the
// scalar preheader.
6053 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6054 bool RequiresScalarEpilogue =
6055 MiddleVPBB->getNumSuccessors() == 1 &&
6056 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
6057 // Bail out for tail-folding (middle block with a single successor to exit).
6058 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
6059 return nullptr;
6060
6061 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
6062 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
6063 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
6064 // TODO: Handle cases where only some interleave groups can be narrowed.
6065 std::unique_ptr<VPlan> NewPlan;
6066 if (size(Plan.vectorFactors()) != 1) {
6067 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
6068 Plan.setVF(*VFToOptimize);
6069 NewPlan->removeVF(*VFToOptimize);
6070 }
6071
6072 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
6073 SmallPtrSet<VPValue *, 4> NarrowedOps;
6074 VPBasicBlock *Preheader = Plan.getVectorPreheader();
6075 // Narrow operation tree rooted at store groups.
// Each store group is replaced by a consecutive, unmasked VPWidenStoreRecipe
// of the narrowed value.
6076 for (auto *StoreGroup : StoreGroups) {
6077 VPValue *Res = narrowInterleaveGroupOp(StoreGroup->getStoredValues(),
6078 NarrowedOps, Preheader);
6079 auto *SI =
6080 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
6081 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
6082 /*Consecutive=*/true, *StoreGroup,
6083 StoreGroup->getDebugLoc());
6084 S->insertBefore(StoreGroup);
6085 StoreGroup->eraseFromParent();
6086 }
6087
6088 // Adjust induction to reflect that the transformed plan only processes one
6089 // original iteration.
6091 Type *CanIVTy = VectorLoop->getCanonicalIVType();
6092 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
6093 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
6094
// The new step is vscale * UF for scalable VFs and UF otherwise; VF itself is
// replaced by vscale or the constant 1 respectively.
6095 VPValue *UF = &Plan.getUF();
6096 VPValue *Step;
6097 if (VFToOptimize->isScalable()) {
6098 VPValue *VScale =
6099 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
6100 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
6101 {true, false});
6102 Plan.getVF().replaceAllUsesWith(VScale);
6103 } else {
6104 Step = UF;
6105 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
6106 }
6107 // Materialize vector trip count with the narrowed step.
6108 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
6109 RequiresScalarEpilogue, Step);
6110
6111 CanIVInc->setOperand(1, Step);
6112 Plan.getVFxUF().replaceAllUsesWith(Step);
6113
6114 removeDeadRecipes(Plan);
6115 assert(none_of(*VectorLoop->getEntryBasicBlock(),
6117 "All VPVectorPointerRecipes should have been removed");
6118 return NewPlan;
6119 }
6120
6121 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
6122 /// BranchOnCond recipe.
/// The weights are {1, VectorStep - 1}, where VectorStep is the plan's
/// concrete UF times the known minimum of \p VF, additionally multiplied by
/// \p VScaleForTuning when \p VF is scalable and a tuning value is given.
/// NOTE(review): the line with the function name and the initializer of
/// MiddleTerm are missing from this listing.
6124 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
6125 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6126 auto *MiddleTerm =
6128 // Only add branch metadata if there is a (conditional) terminator.
6129 if (!MiddleTerm)
6130 return;
6131
6132 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
6133 "must have a BranchOnCond");
6134 // Assume that `TripCount % VectorStep ` is equally distributed.
6135 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
6136 if (VF.isScalable() && VScaleForTuning.has_value())
6137 VectorStep *= *VScaleForTuning;
6138 assert(VectorStep > 0 && "trip count should not be zero");
6139 MDBuilder MDB(Plan.getContext());
6140 MDNode *BranchWeights =
6141 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
6142 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
6143 }
6144
6146 VFRange &Range) {
     // NOTE(review): the line carrying this function's name and leading
     // parameter(s) (6145 in the listing) is absent from this listing, as are
     // lines 6167, 6177 and 6249 below -- restore them from the original file.
     //
     // For every first-order recurrence phi in the vector loop header, rewrite
     // the middle-block extracts of the recurrence splice that feed VPIRPhi
     // users so that they read the penultimate element instead (see the
     // overview further down).
6147 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
6148 auto *MiddleVPBB = Plan.getMiddleBlock();
     // New extracts are created at the first non-phi position of the middle
     // block.
6149 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6150
     // True only for the scalable element count `vscale x 1`.
6151 auto IsScalableOne = [](ElementCount VF) -> bool {
6152 return VF == ElementCount::getScalable(1);
6153 };
6154
6155 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
6156 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
6157 if (!FOR)
6158 continue;
6159
6160 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
6161 "Cannot handle loops with uncountable early exits");
6162
6163 // Find the existing splice for this FOR, created in
6164 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
6165 // RecurSplice there; only RecurSplice itself still references FOR.
6166 auto *RecurSplice =
6168 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
6169
6170 // For VF vscale x 1, if vscale = 1, we are unable to extract the
6171 // penultimate value of the recurrence. Instead we rely on the existing
6172 // extract of the last element from the result of
6173 // VPInstruction::FirstOrderRecurrenceSplice.
6174 // TODO: Consider vscale_range info and UF.
     // NOTE(review): the call whose last argument is `Range` (line 6177) is
     // missing from this listing; presumably it evaluates IsScalableOne over
     // the VF range -- confirm. Also note that the `return` below leaves the
     // whole function, so later header phis are not visited.
6175 if (any_of(RecurSplice->users(),
6176 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
6178 Range))
6179 return;
6180
6181 // This is the second phase of vectorizing first-order recurrences, creating
6182 // extracts for users outside the loop. An overview of the transformation is
6183 // described below. Suppose we have the following loop with some use after
6184 // the loop of the last a[i-1],
6185 //
6186 // for (int i = 0; i < n; ++i) {
6187 // t = a[i - 1];
6188 // b[i] = a[i] - t;
6189 // }
6190 // use t;
6191 //
6192 // There is a first-order recurrence on "a". For this loop, the shorthand
6193 // scalar IR looks like:
6194 //
6195 // scalar.ph:
6196 // s.init = a[-1]
6197 // br scalar.body
6198 //
6199 // scalar.body:
6200 // i = phi [0, scalar.ph], [i+1, scalar.body]
6201 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
6202 // s2 = a[i]
6203 // b[i] = s2 - s1
6204 // br cond, scalar.body, exit.block
6205 //
6206 // exit.block:
6207 // use = lcssa.phi [s1, scalar.body]
6208 //
6209 // In this example, s1 is a recurrence because its value depends on the
6210 // previous iteration. In the first phase of vectorization, we created a
6211 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
6212 // for users in the scalar preheader and exit block.
6213 //
6214 // vector.ph:
6215 // v_init = vector(..., ..., ..., a[-1])
6216 // br vector.body
6217 //
6218 // vector.body
6219 // i = phi [0, vector.ph], [i+4, vector.body]
6220 // v1 = phi [v_init, vector.ph], [v2, vector.body]
6221 // v2 = a[i, i+1, i+2, i+3]
6222 // v1' = splice(v1(3), v2(0, 1, 2))
6223 // b[i, i+1, i+2, i+3] = v2 - v1'
6224 // br cond, vector.body, middle.block
6225 //
6226 // middle.block:
6227 // vector.recur.extract.for.phi = v2(2)
6228 // vector.recur.extract = v2(3)
6229 // br cond, scalar.ph, exit.block
6230 //
6231 // scalar.ph:
6232 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6233 // [s.init, otherwise]
6234 // br scalar.body
6235 //
6236 // scalar.body:
6237 // i = phi [0, scalar.ph], [i+1, scalar.body]
6238 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6239 // s2 = a[i]
6240 // b[i] = s2 - s1
6241 // br cond, scalar.body, exit.block
6242 //
6243 // exit.block:
6244 // lo = lcssa.phi [s1, scalar.body],
6245 // [vector.recur.extract.for.phi, middle.block]
6246 //
6247 // Update extracts of the splice in the middle block: they extract the
6248 // penultimate element of the recurrence.
     // NOTE(review): the loop header that declares `R` (line 6249) is missing
     // from this listing; only its range argument on the next line survives.
6250 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
6251 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
6252 continue;
6253
6254 auto *ExtractR = cast<VPInstruction>(&R);
     // The penultimate element is taken from the splice's second operand
     // (operand index 1), not from the splice result itself.
6255 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6256 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
6257 {}, "vector.recur.extract.for.phi");
     // Only VPIRPhi users of the old extract are redirected; any other users
     // keep using ExtractR. The user list is copied first because it is
     // modified while iterating.
6258 for (VPUser *ExitU : to_vector(ExtractR->users())) {
6259 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6260 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6261 }
6262 }
6263 }
6264}
6265
6266/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6267/// value. Returns the widened IV if found, nullptr otherwise.
///
/// Only VPWidenRecipe binary operators are considered, and integer
/// division/remainder opcodes are rejected. The IV may be either operand; the
/// other operand must be defined outside loop regions.
///
/// NOTE(review): the signature line (6268 in the listing) is absent from this
/// listing -- restore it from the original file.
6269 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6270 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6271 Instruction::isIntDivRem(BinOp->getOpcode()))
6272 return nullptr;
6273
     // Assume the IV is operand 0; if it is not, try the operands the other way
     // round.
6274 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6275 VPValue *InvariantCandidate = BinOp->getOperand(1);
6276 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6277 std::swap(WidenIVCandidate, InvariantCandidate);
6278
6279 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6280 return nullptr;
6281
     // Yields nullptr when neither operand is a widened int/FP induction.
6282 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6283}
6284
6285/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6286/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
///
/// \p BinOp itself is left untouched; the returned recipe is a clone of it.
///
/// NOTE(review): the signature and the start of the leading assert (lines
/// 6287-6289 in the listing) are absent from this listing -- restore them from
/// the original file.
6290 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6291 auto *ClonedOp = BinOp->clone();
     // Replace whichever operand is the widened IV; exactly one must match.
6292 if (ClonedOp->getOperand(0) == WidenIV) {
6293 ClonedOp->setOperand(0, ScalarIV);
6294 } else {
6295 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6296 ClonedOp->setOperand(1, ScalarIV);
6297 }
6298 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6299 return ClonedOp;
6300}
6301
6304 Loop &L) {
     // NOTE(review): the lines carrying this function's name and leading
     // parameters (6302-6303 in the listing) are absent from this listing, as
     // are lines 6330, 6368, 6421, 6423 and 6457 below -- restore them from the
     // original file.
     //
     // For each reduction phi in the vector loop header that passes the
     // recurrence-kind filter and has a non-pointer, non-floating-point type,
     // try to turn its find-last select over an affine AddRec into a
     // RecurKind::FindIV reduction that is reduced with a min/max in the middle
     // block. Whether the select condition was ever true is detected either by
     // comparing against a sentinel value or via a boolean AnyOf reduction.
6305 ScalarEvolution &SE = *PSE.getSE();
6306 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6307
6308 // Helper lambda to check if the IV range excludes the sentinel value. Try
6309 // signed first, then unsigned. Return an excluded sentinel if found,
6310 // otherwise return std::nullopt.
6311 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6312 bool UseMax) -> std::optional<APSInt> {
6313 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6314 for (bool Signed : {true, false}) {
     // A max reduction uses the minimum value as sentinel and vice versa.
6315 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6316 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6317
6318 ConstantRange IVRange =
6319 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6320 if (!IVRange.contains(Sentinel))
6321 return Sentinel;
6322 }
6323 return std::nullopt;
6324 };
6325
6326 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
6327 for (VPRecipeBase &Phi :
6328 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6329 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
     // NOTE(review): the start of the filter condition (line 6330) is missing
     // from this listing; its tail below tests PhiR's recurrence kind.
6331 PhiR->getRecurrenceKind()))
6332 continue;
6333
6334 Type *PhiTy = PhiR->getScalarType();
6335 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6336 continue;
6337
6338 // If there's a header mask, the backedge select will not be the find-last
6339 // select.
6340 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6341 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
     // With a header mask, the backedge value must be
     // select(HeaderMask, <find-last select>, PhiR); the match rebinds
     // FindLastSelect to the inner select.
6342 if (HeaderMask &&
6343 !match(BackedgeVal,
6344 m_Select(m_Specific(HeaderMask),
6345 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6346 continue;
6347
6348 // Get the find-last expression from the find-last select of the reduction
6349 // phi. The find-last select should be a select between the phi and the
6350 // find-last expression.
6351 VPValue *Cond, *FindLastExpression;
6352 if (!match(FindLastSelect, m_SelectLike(m_VPValue(Cond), m_Specific(PhiR),
6353 m_VPValue(FindLastExpression))) &&
6354 !match(FindLastSelect,
6355 m_SelectLike(m_VPValue(Cond), m_VPValue(FindLastExpression),
6356 m_Specific(PhiR))))
6357 continue;
6358
6359 // Check if FindLastExpression is a simple expression of a widened IV. If
6360 // so, we can track the underlying IV instead and sink the expression.
6361 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6362 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6363 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6364 &L);
6365 const SCEV *Step;
6366 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6367 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6369 "IVOfExpressionToSink not being an AddRec must imply "
6370 "FindLastExpression not being an AddRec.");
6371 continue;
6372 }
6373
6374 // Determine direction from SCEV step.
6375 if (!SE.isKnownNonZero(Step))
6376 continue;
6377
6378 // Positive step means we need UMax/SMax to find the last IV value, and
6379 // UMin/SMin otherwise.
6380 bool UseMax = SE.isKnownPositive(Step);
6381 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6382 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6383
6384 // Sinking an expression will disable epilogue vectorization. Only use it,
6385 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6386 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6387 // multiply or divide by large constant, respectively), which also makes
6388 // sinking undesirable.
6389 if (IVOfExpressionToSink) {
6390 const SCEV *FindLastExpressionSCEV =
6391 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
     // Note: this match overwrites Step with the step of the full expression.
6392 if (match(FindLastExpressionSCEV,
6393 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6394 bool NewUseMax = SE.isKnownPositive(Step);
6395 if (auto NewSentinel =
6396 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6397 // The original expression already has a sentinel, so prefer not
6398 // sinking to keep epilogue vectorization possible.
6399 SentinelVal = *NewSentinel;
6400 UseSigned = NewSentinel->isSigned();
6401 UseMax = NewUseMax;
6402 IVSCEV = FindLastExpressionSCEV;
6403 IVOfExpressionToSink = nullptr;
6404 }
6405 }
6406 }
6407
6408 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6409 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6410 // cannot use min/max.
6411 if (!SentinelVal) {
6412 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6413 if (AR->hasNoSignedWrap())
6414 UseSigned = true;
6415 else if (AR->hasNoUnsignedWrap())
6416 UseSigned = false;
6417 else
6418 continue;
6419 }
6420
     // NOTE(review): `RdxResult`, used further down, is declared in the lines
     // missing here (6421 and 6423); only one argument line survives.
6422 BackedgeVal,
6424
6425 VPValue *NewFindLastSelect = BackedgeVal;
6426 VPValue *SelectCond = Cond;
6427 if (!SentinelVal || IVOfExpressionToSink) {
6428 // When we need to create a new select, normalize the condition so that
6429 // PhiR is the last operand and include the header mask if needed.
6430 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6431 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
6432 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6433 SelectCond = LoopBuilder.createNot(SelectCond);
6434
6435 // When tail folding, mask the condition with the header mask to prevent
6436 // propagating poison from inactive lanes in the last vector iteration.
6437 if (HeaderMask)
6438 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6439
6440 if (SelectCond != Cond || IVOfExpressionToSink) {
6441 NewFindLastSelect = LoopBuilder.createSelect(
6442 SelectCond,
6443 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6444 PhiR, DL);
6445 }
6446 }
6447
6448 // Create the reduction result in the middle block using sentinel directly.
6449 RecurKind MinMaxKind =
6450 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6451 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6452 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6453 FastMathFlags());
6454 DebugLoc ExitDL = RdxResult->getDebugLoc();
6455 VPBuilder MiddleBuilder(RdxResult);
6456 VPValue *ReducedIV =
6458 NewFindLastSelect, Flags, ExitDL);
6459
6460 // If IVOfExpressionToSink is an expression to sink, sink it now.
6461 VPValue *VectorRegionExitingVal = ReducedIV;
6462 if (IVOfExpressionToSink)
6463 VectorRegionExitingVal =
6464 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6465 ReducedIV, IVOfExpressionToSink);
6466
6467 VPValue *NewRdxResult;
6468 VPValue *StartVPV = PhiR->getStartValue();
6469 if (SentinelVal) {
6470 // Sentinel-based approach: reduce IVs with min/max, compare against
6471 // sentinel to detect if condition was ever true, select accordingly.
6472 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6473 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6474 Sentinel, ExitDL);
6475 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6476 StartVPV, ExitDL);
     // The new phi starts at the sentinel; the original start value is only
     // used by the select above.
6477 StartVPV = Sentinel;
6478 } else {
6479 // Introduce a boolean AnyOf reduction to track if the condition was ever
6480 // true in the loop. Use it to select the initial start value, if it was
6481 // never true.
6482 auto *AnyOfPhi = new VPReductionPHIRecipe(
6483 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6484 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6485 AnyOfPhi->insertAfter(PhiR);
6486
6487 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6488 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
     // Patch the placeholder backedge operand now that OrVal exists.
6489 AnyOfPhi->setOperand(1, OrVal);
6490
6491 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6492 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6493
6494 // Initialize the IV reduction phi with the neutral element, not the
6495 // original start value, to ensure correct min/max reduction results.
6496 StartVPV = Plan.getOrAddLiveIn(
6497 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6498 }
6499 RdxResult->replaceAllUsesWith(NewRdxResult);
6500 RdxResult->eraseFromParent();
6501
     // Replace the original phi with a FindIV reduction phi that uses the new
     // start value and select.
6502 auto *NewPhiR = new VPReductionPHIRecipe(
6503 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6504 *NewFindLastSelect, RdxUnordered{1}, {},
6505 PhiR->hasUsesOutsideReductionChain());
6506 NewPhiR->insertBefore(PhiR);
6507 PhiR->replaceAllUsesWith(NewPhiR);
6508 PhiR->eraseFromParent();
6509 }
6510}
6511
6512namespace {
6513
/// Shorthand for the extend-kind enum used when describing partial reductions.
6514using ExtendKind = TTI::PartialReductionExtendKind;
/// Describes one extend feeding a partial reduction: the scalar type of the
/// extend's input and the kind of extend. A default-constructed value
/// (nullptr, PR_None) stands for "no extend".
6515struct ReductionExtend {
6516 Type *SrcType = nullptr;
6517 ExtendKind Kind = ExtendKind::PR_None;
6518};
6519
6520/// Describes the extends used to compute the extended reduction operand.
6521/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6522/// operation.
/// ExtendB counts as absent when its Kind is ExtendKind::PR_None.
6523struct ExtendedReductionOperand {
6524 /// The recipe that consumes the extends.
6525 VPWidenRecipe *ExtendsUser = nullptr;
6526 /// Extend descriptions (inputs to getPartialReductionCost).
6527 ReductionExtend ExtendA, ExtendB;
6528};
6529
6530/// A chain of recipes that form a partial reduction. Matches either
6531/// reduction_bin_op (extended op, accumulator), or
6532/// reduction_bin_op (accumulator, extended op).
6533/// The possible forms of the "extended op" are listed in
6534/// matchExtendedReductionOperand.
6535struct VPPartialReductionChain {
6536 /// The top-level binary operation that forms the reduction to a scalar
6537 /// after the loop body.
6538 VPWidenRecipe *ReductionBinOp = nullptr;
6539 /// The user of the extends that is then reduced.
6540 ExtendedReductionOperand ExtendedOp;
6541 /// The recurrence kind for the entire partial reduction chain.
6542 /// This allows distinguishing between Sub and AddWithSub recurrences,
6543 /// when the ReductionBinOp is an Instruction::Sub.
6544 RecurKind RK;
6545 /// The index of the accumulator operand of ReductionBinOp. The extended op
6546 /// is `1 - AccumulatorOpIdx`.
6547 unsigned AccumulatorOpIdx;
     /// The VF scale factor given to the partial reduction recipe and, for the
     /// last link of the chain, to the reduction PHI.
6548 unsigned ScaleFactor;
6549};
6550
/// Rewrite the extended operand \p Op of a partial reduction into one of the
/// forms listed in the comments below, and return the recipe that now
/// represents the operand. \p Op is returned unchanged when no rewrite applies.
///
/// NOTE(review): lines 6561, 6576-6577 and 6603 of the listing (parts of the
/// three conditions) are absent from this listing -- restore them from the
/// original file.
6551static VPSingleDefRecipe *
6552optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6553 // reduce.add(mul(ext(A), C))
6554 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6555 const APInt *Const;
6556 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6557 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6558 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6559 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
6560 if (!Op->hasOneUse() ||
6562 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6563 return Op;
6564
     // Rewrite the constant operand in place as ext(trunc(C)), using the same
     // extend opcode and types as ExtA.
6565 VPBuilder Builder(Op);
6566 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6567 Op->getOperand(1), NarrowTy);
6568 Type *WideTy = ExtA->getScalarType();
6569 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6570 return Op;
6571 }
6572
6573 // reduce.add(abs(sub(ext(A), ext(B))))
6574 // -> reduce.add(ext(absolute-difference(A, B)))
6575 VPValue *X, *Y;
     // NOTE(review): the `if (match(...))` that binds X and Y (lines 6576-6577)
     // is missing from this listing.
6578 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6579 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6580 assert(Ext->getOpcode() ==
6581 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6582 "Expected both the LHS and RHS extends to be the same");
6583 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6584 VPBuilder Builder(Op);
6585 Type *SrcTy = X->getScalarType();
     // The absolute difference is built in the narrow source type as
     // max(X, Y) - min(X, Y) on frozen inputs, then zero-extended to Op's type.
6586 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6587 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6588 auto *Max = Builder.insert(
6589 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6590 {FreezeX, FreezeY}, SrcTy));
6591 auto *Min = Builder.insert(
6592 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6593 {FreezeX, FreezeY}, SrcTy));
6594 auto *AbsDiff =
6595 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6596 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6597 Op->getScalarType());
6598 }
6599
6600 // reduce.add(ext(mul(ext(A), ext(B))))
6601 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6602 // TODO: Support this optimization for float types.
6604 m_ZExtOrSExt(m_VPValue()))))) {
6605 auto *Ext = cast<VPWidenCastRecipe>(Op);
6606 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6607 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6608 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
     // Bail out unless the mul has a single use, both inner extends share an
     // opcode, and the outer extend has that opcode too (the last requirement
     // is waived when both mul operands are the same recipe).
6609 if (!Mul->hasOneUse() ||
6610 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6611 MulLHS->getOpcode() != MulRHS->getOpcode())
6612 return Op;
6613 VPBuilder Builder(Mul);
6614 auto *NewLHS = Builder.createWidenCast(
6615 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
     // Reuse the widened LHS when both mul operands were the same extend.
6616 auto *NewRHS = MulLHS == MulRHS
6617 ? NewLHS
6618 : Builder.createWidenCast(MulRHS->getOpcode(),
6619 MulRHS->getOperand(0),
6620 Ext->getScalarType());
6621 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6622 Builder.insert(NewMul);
6623 Op->replaceAllUsesWith(NewMul);
6624 Op->eraseFromParent();
6625 Mul->eraseFromParent();
6626 return NewMul;
6627 }
6628
6629 return Op;
6630}
6631
/// Bundle the reduction \p Red together with the recipes that compute its
/// vector operand into a new VPExpressionRecipe. The supported operand shapes
/// are listed in the comments below; any other shape hits llvm_unreachable.
/// The returned recipe is newly allocated and not inserted by this function.
///
/// NOTE(review): lines 6654, 6664 and 6674 of the listing (parts of three
/// match conditions) are absent from this listing -- restore them from the
/// original file.
6632static VPExpressionRecipe *
6633createPartialReductionExpression(VPReductionRecipe *Red) {
6634 VPValue *VecOp = Red->getVecOp();
6635
6636 // reduce.[f]add(ext(op))
6637 // -> VPExpressionRecipe(op, red)
6638 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6639 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6640
6641 // reduce.[f]add(neg(ext(op)))
6642 // -> VPExpressionRecipe(op, sub/neg, red)
6643 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6644 auto *Neg = cast<VPWidenRecipe>(VecOp);
     // The extend is the last operand of the negation recipe.
6645 auto *Ext =
6646 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6647 return new VPExpressionRecipe(Ext, Neg, Red);
6648 }
6649
6650 // reduce.[f]add([f]mul(ext(a), ext(b)))
6651 // -> VPExpressionRecipe(a, b, mul, red)
6652 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6653 match(VecOp,
6655 auto *Mul = cast<VPWidenRecipe>(VecOp);
6656 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6657 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6658 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6659 }
6660
6661 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6662 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6663 if (match(VecOp,
6665 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6666 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6667 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6668 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6669 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6670 }
6671
6672 // reduce.add(neg(mul(ext(a), ext(b))))
6673 // -> VPExpressionRecipe(a, b, mul, sub, red)
6675 m_ZExtOrSExt(m_VPValue()))))) {
6676 auto *Sub = cast<VPWidenRecipe>(VecOp);
     // For the integer form the mul is operand 1 of the sub.
6677 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6678 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6679 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6680 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6681 }
6682
6683 llvm_unreachable("Unsupported expression");
6684}
6685
6686// Helper to transform a partial reduction chain into a partial reduction
6687// recipe. Assumes profitability has been checked.
//
// Chain describes one link (the reduction binary op plus its extended
// operand); RdxPhi is the reduction PHI the chain belongs to. The PHI, its
// start value and the reduction result are only updated when this link is the
// last one in the chain.
//
// NOTE(review): lines 6725, 6730, 6739, 6749 and 6791 of the listing are
// absent from this listing -- restore them from the original file.
6688static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6689 VPlan &Plan,
6690 VPReductionPHIRecipe *RdxPhi) {
6691 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6692 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6693
6694 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6695 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6696 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6697
6698 // FIXME: Do these transforms before invoking the cost-model.
6699 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6700
6701 // Sub-reductions can be implemented in two ways:
6702 // (1) negate the operand in the vector loop (the default way).
6703 // (2) subtract the reduced value from the init value in the middle block.
6704 // Both ways keep the reduction itself as an 'add' reduction.
6705 //
6706 // The ISD nodes for partial reductions don't support folding the
6707 // sub/negation into its operands because the following is not a valid
6708 // transformation:
6709 // sub(0, mul(ext(a), ext(b)))
6710 // -> mul(ext(a), ext(sub(0, b)))
6711 //
6712 // It's therefore better to choose option (2) such that the partial
6713 // reduction is always positive (starting at '0') and to do a final
6714 // subtract in the middle block.
     // The block below implements option (1): it runs only for a sub/fsub link
     // whose chain recurrence kind is not Sub/FSub.
6715 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6716 Chain.RK != RecurKind::Sub) ||
6717 (WidenRecipe->getOpcode() == Instruction::FSub &&
6718 Chain.RK != RecurKind::FSub)) {
6719 VPBuilder Builder(WidenRecipe);
6720 Type *ElemTy = ExtendedOp->getScalarType();
6721 VPWidenRecipe *NegRecipe;
6722 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6723 NegRecipe =
6724 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6726 } else {
6727 auto *Zero = Plan.getZero(ElemTy);
6728 NegRecipe =
6729 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6731 }
6732 Builder.insert(NegRecipe);
6733 ExtendedOp = NegRecipe;
6734 }
6735
6736 // Check if WidenRecipe is the final result of the reduction. If so look
6737 // through selects for predicated reductions.
6738 VPValue *Cond = nullptr;
     // NOTE(review): the declaration of `ExitValue` (line 6739) is missing from
     // this listing; only the continuation of its initializer survives.
6740 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6741 m_Specific(RdxPhi))));
6742 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6743 RdxPhi->getBackedgeValue() == ExitValue;
6744 assert((!ExitValue || IsLastInChain) &&
6745 "if we found ExitValue, it must match RdxPhi's backedge value");
6746
6747 Type *PhiType = RdxPhi->getScalarType();
6748 RecurKind RdxKind =
6750 auto *PartialRed = new VPReductionRecipe(
6751 RdxKind,
6752 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6753 : FastMathFlags(),
6754 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6755 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6756 PartialRed->insertBefore(WidenRecipe);
6757
     // Redirect users of the predicating select (if one was matched) and of the
     // original binary op to the new partial reduction.
6758 if (Cond)
6759 ExitValue->replaceAllUsesWith(PartialRed);
6760 WidenRecipe->replaceAllUsesWith(PartialRed);
6761
6762 // For cost-model purposes, fold this into a VPExpression.
6763 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6764 E->insertBefore(WidenRecipe);
6765 PartialRed->replaceAllUsesWith(E);
6766
6767 // We only need to update the PHI node once, which is when we find the
6768 // last reduction in the chain.
6769 if (!IsLastInChain)
6770 return;
6771
6772 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6773 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6774 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6775
6776 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6777 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
     // Operand 2 of the start-vector instruction is set to the scale factor as
     // a 32-bit constant.
6778 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6779 StartInst->setOperand(2, NewScaleFactor);
6780
6781 // If this is the last value in a sub-reduction chain, then update the PHI
6782 // node to start at `0` and update the reduction-result to subtract from
6783 // the PHI's start value.
6784 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6785 return;
6786
     // Operand 0 (the start value) is replaced by operand 1 of the start-vector
     // instruction; presumably operand 1 is the identity ('0') -- confirm.
6787 VPValue *OldStartValue = StartInst->getOperand(0);
6788 StartInst->setOperand(0, StartInst->getOperand(1));
6789
6790 // Replace reduction_result by 'sub (startval, reductionresult)'.
     // NOTE(review): the declaration of `RdxResult` (line 6791) is missing from
     // this listing.
6792 assert(RdxResult && "Could not find reduction result");
6793
6794 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6795 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6796 : Instruction::BinaryOps::Sub;
6797 VPInstruction *NewResult = Builder.createNaryOp(
6798 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6799 RdxPhi->getDebugLoc());
     // Every user except NewResult itself is redirected, so the new subtract
     // keeps RdxResult as its operand.
6800 RdxResult->replaceUsesWithIf(
6801 NewResult,
6802 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6803}
6804
6805/// Returns the cost of a link in a partial-reduction chain for a given VF.
///
/// The query is forwarded to TTI::getPartialReductionCost using the extend
/// descriptions stored in \p Link. Links of a Sub/FSub recurrence are costed
/// with an Add/FAdd opcode; all other links use the opcode of their reduction
/// binary op.
6806static InstructionCost
6807getPartialReductionLinkCost(VPCostContext &CostCtx,
6808 const VPPartialReductionChain &Link,
6809 ElementCount VF) {
6810 Type *RdxType = Link.ReductionBinOp->getScalarType();
6811 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6812 std::optional<unsigned> BinOpc = std::nullopt;
6813 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6814 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6815 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6816
     // Fast-math flags are only passed along for floating-point reductions.
6817 std::optional<llvm::FastMathFlags> Flags;
6818 if (RdxType->isFloatingPointTy())
6819 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6820
     // Maps the link's recurrence kind to the opcode used for the cost query.
6821 auto GetLinkOpcode = [&Link]() -> unsigned {
6822 switch (Link.RK) {
6823 case RecurKind::Sub:
6824 return Instruction::Add;
6825 case RecurKind::FSub:
6826 return Instruction::FAdd;
6827 default:
6828 return Link.ReductionBinOp->getOpcode();
6829 }
6830 };
6831
6832 return CostCtx.TTI.getPartialReductionCost(
6833 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6834 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6835 CostCtx.CostKind, Flags);
6836}
6837
/// Returns the partial-reduction extend kind for the cast recipe \p Cast.
///
/// NOTE(review): the body (line 6839 in the listing) is absent from this
/// listing, so the exact mapping cannot be confirmed from here -- restore it
/// from the original file.
6838static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6840}
6841
6842/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6843/// operand. This is an operand where the source of the value (e.g. a load) has
6844/// been extended (sext, zext, or fpext) before it is used in the reduction.
6845///
6846/// Possible forms matched by this function:
6847/// - UpdateR(PrevValue, ext(...))
6848/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6849/// - UpdateR(PrevValue, mul(ext(...), Constant))
6850/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6851/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6852/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6853///
6854/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// Returns the matched description, or std::nullopt if \p Op has none of the
/// forms above.
///
/// NOTE(review): lines 6867-6869, 6888, 6911, 6923 and 6933 of the listing are
/// absent from this listing -- restore them from the original file.
6855static std::optional<ExtendedReductionOperand>
6856matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6857 assert(is_contained(UpdateR->operands(), Op) &&
6858 "Op should be operand of UpdateR");
6859
6860 // Try matching an absolute difference operand of the form
6861 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6862 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6863 // difference on a wider type and get the extend for "free" from the partial
6864 // reduction.
6865 VPValue *X, *Y;
6866 if (Op->hasOneUse() &&
6870 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6871 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6872 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6873 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6874 Type *LHSInputType = X->getScalarType();
6875 Type *RHSInputType = Y->getScalarType();
     // Both extends must have the same input type and the same opcode.
6876 if (LHSInputType != RHSInputType ||
6877 LHSExt->getOpcode() != RHSExt->getOpcode())
6878 return std::nullopt;
6879 // Note: This is essentially the same as matching ext(...) as we will
6880 // rewrite this operand to ext(absolute-difference(A, B)).
6881 return ExtendedReductionOperand{
6882 Sub,
6883 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6884 /*ExtendB=*/{}};
6885 }
6886
6887 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
     // NOTE(review): the condition opening this block (line 6888) is missing
     // from this listing; the body treats Op as a VPWidenCastRecipe.
6889 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6890 VPValue *CastSource = CastRecipe->getOperand(0);
6891 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6892 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6893 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6894 // Match: ext(mul(...))
6895 // Record the outer extend kind and set `Op` to the mul. We can then match
6896 // this as a binary operation. Note: We can optimize out the outer extend
6897 // by widening the inner extends to match it. See
6898 // optimizeExtendsForPartialReduction.
6899 Op = CastSource;
6900 } else {
     // Plain ext(...): the reduction update itself is the user of the extend.
6901 return ExtendedReductionOperand{
6902 UpdateR,
6903 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6904 /*ExtendB=*/{}};
6905 }
6906 }
6907
6908 if (!Op->hasOneUse())
6909 return std::nullopt;
6910
     // NOTE(review): the declaration of `MulOp` (line 6911) is missing from
     // this listing.
6912 if (!MulOp ||
6913 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6914 return std::nullopt;
6915
6916 // The rest of the matching assumes `Op` is a (possibly extended) mul
6917 // operation.
6918
6919 VPValue *LHS = MulOp->getOperand(0);
6920 VPValue *RHS = MulOp->getOperand(1);
6921
6922 // The LHS of the operation must always be an extend.
6924 return std::nullopt;
6925
6926 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6927 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6928 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6929
6930 // The RHS of the operation can be an extend or a constant integer.
6931 const APInt *RHSConst = nullptr;
6932 VPWidenCastRecipe *RHSCast = nullptr;
6934 RHSCast = cast<VPWidenCastRecipe>(RHS);
6935 else if (!match(RHS, m_APInt(RHSConst)) ||
6936 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6937 return std::nullopt;
6938
6939 // The outer extend kind must match the inner extends for folding.
6940 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6941 if (Cast && OuterExtKind &&
6942 getPartialReductionExtendKind(Cast) != OuterExtKind)
6943 return std::nullopt;
6944
     // A constant RHS is described with the LHS's input type and extend kind.
6945 Type *RHSInputType = LHSInputType;
6946 ExtendKind RHSExtendKind = LHSExtendKind;
6947 if (RHSCast) {
6948 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6949 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6950 }
6951
6952 return ExtendedReductionOperand{
6953 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6954}
6955
6956/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6957/// and determines if the target can use a cheaper operation with a wider
6958/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6959/// of operations in the reduction.
///
/// Returns std::nullopt when no compute-reduction-result is found for the phi,
/// when a value on the chain is not a VPWidenRecipe with a binary opcode, when
/// neither operand of a link matches an extended reduction operand, or when the
/// phi size has no known scalar factor relative to the extend source size.
///
/// NOTE(review): \p CostCtx and \p Range are not referenced by any line visible
/// in this listing -- confirm against the full source.
6960static std::optional<SmallVector<VPPartialReductionChain>>
6961getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6962 VFRange &Range) {
6963 // Get the backedge value from the reduction PHI and find the
6964 // ComputeReductionResult that uses it (directly or through a select for
6965 // predicated reductions).
6966 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6967 if (!RdxResult)
6968 return std::nullopt;
6969 VPValue *ExitValue = RdxResult->getOperand(0);
 // If the exit value is a select, rebind ExitValue to the select's second
 // operand. The result of match() is intentionally ignored.
6970 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6971
 // NOTE(review): source line 6972 is absent from this listing; `Chain`, which
 // is appended to and returned below, is presumably declared there.
6973 RecurKind RK = RedPhiR->getRecurrenceKind();
6974 Type *PhiType = RedPhiR->getScalarType();
6975 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6976
6977 // Work backwards from the ExitValue examining each reduction operation.
6978 VPValue *CurrentValue = ExitValue;
6979 while (CurrentValue != RedPhiR) {
6980 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6981 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6982 return std::nullopt;
6983
6984 VPValue *Op = UpdateR->getOperand(1);
6985 VPValue *PrevValue = UpdateR->getOperand(0);
6986
6987 // Find the extended operand. The other operand (PrevValue) is the next link
6988 // in the reduction chain.
 // Operand 1 is tried first; if it does not match, operand 0 is tried and the
 // roles of Op and PrevValue are swapped.
6989 std::optional<ExtendedReductionOperand> ExtendedOp =
6990 matchExtendedReductionOperand(UpdateR, Op);
6991 if (!ExtendedOp) {
6992 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6993 if (!ExtendedOp)
6994 return std::nullopt;
6995 std::swap(Op, PrevValue);
6996 }
6997
6998 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6999 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
7000 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
7001 return std::nullopt;
7002
7003 // Check if a partial reduction chain is supported by the target (i.e. does
7004 // not have an invalid cost) for the given VF range. Clamps the range and
7005 // returns true if feasible for any VF.
 // NOTE(review): the visible statements below only build the link and record
 // it; no cost query or range clamping appears here, so the comment above may
 // be stale.
 // The fourth initializer is 0 when the chain continues through operand 0 of
 // UpdateR and 1 otherwise; the fifth is the scale factor obtained from
 // PHISize.getKnownScalarFactor(ExtSrcSize).
7006 VPPartialReductionChain Link(
7007 {UpdateR, *ExtendedOp, RK,
7008 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
7009 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
7010 Chain.push_back(Link);
7011 CurrentValue = PrevValue;
7012 }
7013
7014 // The chain links were collected by traversing backwards from the exit value.
7015 // Reverse the chains so they are in program order.
7016 std::reverse(Chain.begin(), Chain.end());
7017 return Chain;
7018}
7019} // namespace
7020
// NOTE(review): source line 7021, which opens this definition (return type,
// name and first parameter), is absent from this listing. The body uses `Plan`,
// so the missing line presumably declares it -- confirm against the full source.
//
// Visible steps: (1) collect one chain per reduction phi of the loop header via
// getScaledReductions, (2) validate the users of each chain's extends and
// reduction bin ops, (3) clear chains rejected by the profitability lambda,
// (4) call transformToPartialReduction on every remaining link.
7022 VPCostContext &CostCtx,
7023 VFRange &Range) {
7024 // Find all possible valid partial reductions, grouping chains by their PHI.
7025 // This grouping allows invalidating the whole chain, if any link is not a
7026 // valid partial reduction.
 // NOTE(review): source line 7027 (the type of `ChainsByPhi`) is absent from
 // this listing.
7028 ChainsByPhi;
7029 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7030 for (VPRecipeBase &R : HeaderVPBB->phis()) {
7031 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7032 if (!RedPhiR)
7033 continue;
7034
7035 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
7036 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
7037 }
7038
7039 if (ChainsByPhi.empty())
7040 return;
7041
7042 // Build set of partial reduction operations for extend user validation and
7043 // a map of reduction bin ops to their scale factors for scale validation.
7044 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
7045 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
7046 for (const auto &[_, Chains] : ChainsByPhi)
7047 for (const VPPartialReductionChain &Chain : Chains) {
7048 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
7049 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
7050 }
7051
7052 // A partial reduction is invalid if any of its extends are used by
7053 // something that isn't another partial reduction. This is because the
7054 // extends are intended to be lowered along with the reduction itself.
 // Operands that are not VPWidenCastRecipes are accepted unconditionally.
7055 auto ExtendUsersValid = [&](VPValue *Ext) {
7056 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
7057 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
7058 });
7059 };
7060
 // Returns true if, for the given VF, the summed partial-reduction cost of the
 // chain is valid and strictly lower than the summed cost of the recipes it
 // would replace. An empty chain yields false (0 < 0).
7061 auto IsProfitablePartialReductionChainForVF =
7062 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
7063 InstructionCost PartialCost = 0, RegularCost = 0;
7064
7065 // The chain is a profitable partial reduction chain if the cost of handling
7066 // the entire chain is cheaper when using partial reductions than when
7067 // handling the entire chain using regular reductions.
7068 for (const VPPartialReductionChain &Link : Chain) {
7069 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
7070 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
7071 if (!LinkCost.isValid())
7072 return false;
7073
7074 PartialCost += LinkCost;
7075 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
7076 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
7077 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
7078 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
7079 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
7080 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
7081 RegularCost += Extend->computeCost(VF, CostCtx);
7082 }
7083 return PartialCost.isValid() && PartialCost < RegularCost;
7084 };
7085
7086 // Validate chains: check that extends are only used by partial reductions,
7087 // and that reduction bin ops are only used by other partial reductions with
7088 // matching scale factors, are outside the loop region or the select
7089 // introduced by tail-folding. Otherwise we would create users of scaled
7090 // reductions where the types of the other operands don't match.
 // Any failing link clears the whole chain for that phi; the `break` directly
 // after each clear() leaves the range-for before its iterators are used again.
7091 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
7092 for (const VPPartialReductionChain &Chain : Chains) {
7093 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
7094 Chains.clear();
7095 break;
7096 }
7097 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
7098 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
7099 return PhiR == RedPhiR;
7100 auto *R = cast<VPSingleDefRecipe>(U);
7101 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
 // NOTE(review): source line 7102 is absent from this listing; the start of
 // the alternative that ends on the next line is not visible.
7103 m_Specific(Chain.ReductionBinOp))) ||
7104 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
7105 m_Specific(RedPhiR)));
7106 };
7107 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
7108 Chains.clear();
7109 break;
7110 }
7111
7112 // Check if the compute-reduction-result is used by a sunk store.
7113 // TODO: Also form partial reductions in those cases.
7114 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
7115 if (any_of(RdxResult->users(), [](VPUser *U) {
7116 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
7117 return RepR && RepR->getOpcode() == Instruction::Store;
7118 })) {
7119 Chains.clear();
7120 break;
7121 }
7122 }
7123 }
7124
7125 // Clear the chain if it is not profitable.
 // NOTE(review): source line 7126 is absent from this listing; it holds the
 // start of the condition that receives the lambda and `Range` below.
7127 [&, &Chains = Chains](ElementCount VF) {
7128 return IsProfitablePartialReductionChainForVF(Chains, VF);
7129 },
7130 Range))
7131 Chains.clear();
7132 }
7133
 // Rewrite every link of every chain that was not cleared above.
7134 for (auto &[Phi, Chains] : ChainsByPhi)
7135 for (const VPPartialReductionChain &Chain : Chains)
7136 transformToPartialReduction(Chain, Plan, Phi);
7137}
7138
// NOTE(review): source line 7139, which opens this definition (return type and
// name), is absent from this listing, as are lines 7144, 7146-7147, 7188, 7195,
// 7206, 7212 and 7215. `MemOps` and `I` are used below without a visible
// declaration for that reason.
//
// Visible behaviour: collects VPInstructions that have an underlying value and
// a Load or Store opcode, then hands them to three named stages
// ("lowerMemoryIdioms", "scalarizeMemOpsWithIrregularTypes",
// "delegateMemOpWideningToLegacyCM"); each stage removes the ops it handled
// from the worklist.
7140 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
7141 // Collect all loads/stores first. We will start with ones having simpler
7142 // decisions followed by more complex ones that are potentially
7143 // guided/dependent on the simpler ones.
7145 for (VPBasicBlock *VPBB :
7148 for (VPRecipeBase &R : *VPBB) {
7149 auto *VPI = dyn_cast<VPInstruction>(&R);
7150 if (VPI && VPI->getUnderlyingValue() &&
7151 is_contained({Instruction::Load, Instruction::Store},
7152 VPI->getOpcode()))
7153 MemOps.push_back(VPI);
7154 }
7155 }
7156
7157 // Few helpers to process different kinds of memory operations.
7158
7159 // To be used as argument to `VPlanTransforms::runPass` which explicitly
7160 // specified pass name, hence `VPlan &` parameter.
 // Runs ProcessVPInst on every pending op; ops for which it returns false stay
 // in MemOps for the next stage.
7161 auto ProcessSubset = [&](VPlan &, auto ProcessVPInst) {
7162 SmallVector<VPInstruction *> RemainingMemOps;
7163 for (VPInstruction *VPI : MemOps) {
7164 if (!ProcessVPInst(VPI))
7165 RemainingMemOps.push_back(VPI);
7166 }
7167
7168 MemOps.clear();
7169 std::swap(MemOps, RemainingMemOps);
7170 };
7171
 // Inserts New before VPI and erases VPI. Uses are redirected only when VPI is
 // a load (presumably because a store defines no value -- confirm).
7172 auto ReplaceWith = [&](VPInstruction *VPI, VPRecipeBase *New) {
7173 New->insertBefore(VPI);
7174 if (VPI->getOpcode() == Instruction::Load)
7175 VPI->replaceAllUsesWith(New->getVPSingleValue());
7176 VPI->eraseFromParent();
7177
7178 // VPI has been processed.
7179 return true;
7180 };
7181
 // Replaces VPI with whatever RecipeBuilder.handleReplication returns for it.
7182 auto Scalarize = [&](VPInstruction *VPI) {
7183 return ReplaceWith(VPI, RecipeBuilder.handleReplication(VPI, Range));
7184 };
7185
7186 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
7187 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
7189 "lowerMemoryIdioms", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7190 if (RecipeBuilder.replaceWithFinalIfReductionStore(
7191 VPI, FinalRedStoresBuilder))
7192 return true;
7193
7194 // Filter out scalar VPlan for the remaining idioms.
7196 [](ElementCount VF) { return VF.isScalar(); }, Range))
7197 return false;
7198
7199 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI))
7200 return ReplaceWith(VPI, Histogram);
7201
7202 return false;
7203 });
7204
7205 // Filter out scalar VPlan for the remaining memory operations.
7207 [](ElementCount VF) { return VF.isScalar(); }, Range))
7208 return;
7209
7210 // If the instruction's allocated size doesn't equal its type size, it
7211 // requires padding and will be scalarized.
7213 "scalarizeMemOpsWithIrregularTypes", ProcessSubset, Plan,
7214 [&](VPInstruction *VPI) {
7216 if (hasIrregularType(getLoadStoreType(I), I->getDataLayout()))
7217 return Scalarize(VPI);
7218
7219 return false;
7220 });
7221
 // Final stage: every remaining op is either widened by tryToWidenMemory or
 // scalarized, so this callback always reports the op as processed.
7222 VPlanTransforms::runPass("delegateMemOpWideningToLegacyCM", ProcessSubset,
7223 Plan, [&](VPInstruction *VPI) {
7224 if (VPRecipeBase *Recipe =
7225 RecipeBuilder.tryToWidenMemory(VPI, Range))
7226 return ReplaceWith(VPI, Recipe);
7227
7228 return Scalarize(VPI);
7229 });
7230}
7231
// NOTE(review): source lines 7232-7233, which open this definition and the
// condition that the lambda below belongs to, are absent from this listing, as
// are lines 7237, 7239, 7261 and 7265. The function name is therefore not
// visible here.
//
// Visible behaviour: walks the recipes of each visited block in reverse and
// replaces every VPInstruction that passes the checks below with the recipe
// returned by VPBuilder::createSingleScalarOp, built without a mask.
7234 [&](ElementCount VF) { return VF.isScalar(); }, Range))
7235 return;
7236
7238 Plan.getEntry());
7240 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
7241 auto *VPI = dyn_cast<VPInstruction>(&R);
7242 if (!VPI)
7243 continue;
7244
7245 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
7246 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
7247 if (!I)
7248 continue;
7249
7250 // If executing other lanes produces side-effects we can't avoid them.
7251 if (VPI->mayHaveSideEffects())
7252 continue;
7253
7254 // We want to drop the mask operand, verify we can safely do that.
7255 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
7256 continue;
7257
7258 // Avoid rewriting IV increment as that interferes with
7259 // `removeRedundantCanonicalIVs`.
 // NOTE(review): the second half of this condition (source line 7261) is
 // absent from this listing.
7260 if (VPI->getOpcode() == Instruction::Add &&
7262 continue;
7263
7264 // Other lanes are needed - can't drop them.
 // NOTE(review): the condition guarding this `continue` (source line 7265) is
 // absent from this listing.
7266 continue;
7267
 // Build the replacement from VPI's opcode and its operands without the mask,
 // insert it before VPI, redirect all uses and erase VPI.
7268 auto *Recipe = VPBuilder::createSingleScalarOp(
7269 VPI->getOpcode(), VPI->operandsWithoutMask(), /*Mask=*/nullptr, *VPI,
7270 *VPI, VPI->getDebugLoc(), I);
7271 Recipe->insertBefore(VPI);
7272 VPI->replaceAllUsesWith(Recipe);
7273 VPI->eraseFromParent();
7274 }
7275 }
7276}
7277
7278/// Returns true if \p Info's parameter kinds are compatible with \p Args.
7279static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7280 PredicatedScalarEvolution &PSE, const Loop *L) {
7281 ScalarEvolution *SE = PSE.getSE();
7282 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
7283 switch (Param.ParamKind) {
7284 case VFParamKind::Vector:
7285 case VFParamKind::GlobalPredicate:
7286 return true;
7287 case VFParamKind::OMP_Uniform:
7288 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
7289 SE->isLoopInvariant(
7290 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7291 L);
7292 case VFParamKind::OMP_Linear:
7293 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7294 m_scev_AffineAddRec(
7295 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
7296 m_SpecificLoop(L)));
7297 default:
7298 return false;
7299 }
7300 });
7301}
7302
7303/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7304/// Returns the variant function, or nullptr. Masked variants are assumed to
7305/// take the mask as a trailing parameter.
///
/// nullptr is returned when \p CI is marked no-builtin, or when no mapping from
/// VFDatabase::getMappings has the requested VF, satisfies the mask requirement
/// and passes areVFParamsOk. The result of Module::getFunction is returned
/// unchecked, so it is also nullptr if the module has no function of that name.
///
/// NOTE(review): source lines 7306 and 7308 (parts of the signature, including
/// the declarations of `CI`, `Args` and `PSE`) are absent from this listing.
7307 ElementCount VF, bool MaskRequired,
7309 const Loop *L) {
7310 if (CI->isNoBuiltin())
7311 return nullptr;
7312 auto Mappings = VFDatabase::getMappings(*CI);
 // The first mapping that satisfies all three conditions is used.
7313 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7314 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7315 areVFParamsOk(Info, Args, PSE, L);
7316 });
7317 if (It == Mappings.end())
7318 return nullptr;
7319 return CI->getModule()->getFunction(It->VectorName);
7320}
7321
7322namespace {
7323/// The outcome of choosing how to widen a call at a given VF.
7324struct CallWideningDecision {
 /// The three ways a call can be handled.
7325 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
 /// Deliberately non-explicit: the code that uses this struct returns a bare
 /// KindTy where a CallWideningDecision is expected.
7326 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7327 : Kind(Kind), Variant(Variant) {}
7328 KindTy Kind;
7329
7330 /// Set when Kind == VectorVariant.
 // NOTE(review): source line 7331, presumably the declaration of the `Variant`
 // member initialised by the constructor above, is absent from this listing.
7332
 /// Two decisions are equal when both the kind and the variant match.
7333 bool operator==(const CallWideningDecision &Other) const {
7334 return Kind == Other.Kind && Variant == Other.Variant;
7335 }
7336};
7337} // namespace
7338
7339/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7340/// vector intrinsic, and vector library variant.
///
/// Ties are resolved in favour of widening: the comparisons below use `>=`, so
/// an intrinsic or a vector variant that costs the same as scalarizing is
/// chosen over scalarizing, and the intrinsic is chosen over an equally priced
/// vector variant.
///
/// NOTE(review): source lines 7342, 7352, 7354, 7358, 7367 and 7374-7375 are
/// absent from this listing, so `Ops`, `ID`, `VecCallCost` and `IntrinsicCost`
/// are used below without a visible declaration.
7341static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7343 ElementCount VF,
7344 VPCostContext &CostCtx) {
7345 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7346
7347 // Scalar VFs and calls forced or known to scalarize always replicate.
7348 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7349 return CallWideningDecision::KindTy::Scalarize;
7350
7351 auto *CalledFn = cast<Function>(
7353 Type *ResultTy = VPI.getScalarType();
7355 bool MaskRequired = CostCtx.isMaskRequired(CI);
7356
7357 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
7359 return CallWideningDecision::KindTy::Scalarize;
7360
7361 InstructionCost ScalarCost =
7362 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7363 /*IsSingleScalar=*/false, VF, CostCtx);
7364
7365 Function *VecFunc =
7366 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
 // The vector-call cost is only computed when a variant was found.
7368 if (VecFunc)
7369 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7370
7371 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7372 // available vector variant.
7373 if (ID) {
7376 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7377 (!VecFunc || VecCallCost >= IntrinsicCost))
7378 return CallWideningDecision::KindTy::Intrinsic;
7379 }
7380
7381 // Otherwise, use a vector library variant when it beats scalarizing.
7382 if (VecFunc && ScalarCost >= VecCallCost)
7383 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7384
7385 return CallWideningDecision::KindTy::Scalarize;
7386}
7387
// NOTE(review): source line 7388, which opens this definition (return type,
// name and leading parameters), is absent from this listing, as are lines
// 7391-7392 (the enclosing block loop), 7405 and 7414 (the declaration of `ID`).
//
// Visible behaviour: for every VPInstruction with an underlying value and a
// Call opcode, pick a widening decision at Range.Start, pass `Range` and a
// predicate comparing that decision with the one for other VFs to a call whose
// first line is not visible, then replace the instruction with a
// widened-intrinsic, widened-call or replicate recipe.
7389 VPRecipeBuilder &RecipeBuilder,
7390 VPCostContext &CostCtx) {
7393 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7394 auto *VPI = dyn_cast<VPInstruction>(&R);
7395 if (!VPI || !VPI->getUnderlyingValue() ||
7396 VPI->getOpcode() != Instruction::Call)
7397 continue;
7398
7399 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
 // Only the first CI->arg_size() operands are taken as call arguments.
7400 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7401 VPI->op_begin() + CI->arg_size());
7402
7403 CallWideningDecision Decision =
7404 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
 // NOTE(review): the call that receives this predicate and `Range` starts on
 // source line 7405, which is absent from this listing.
7406 [&](ElementCount VF) {
7407 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7408 },
7409 Range);
7410
7411 VPSingleDefRecipe *Replacement = nullptr;
7412 switch (Decision.Kind) {
7413 case CallWideningDecision::KindTy::Intrinsic: {
7415 Type *ResultTy = VPI->getScalarType();
7416 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7417 *VPI, VPI->getDebugLoc());
7418 break;
7419 }
7420 case CallWideningDecision::KindTy::VectorVariant: {
7421 // Masked variants take the mask as a trailing parameter, so they have
7422 // one more parameter than the original call's arguments.
 // An all-true mask from the plan is used when VPI itself is unmasked.
7423 if (Decision.Variant->arg_size() > Ops.size()) {
7424 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7425 Ops.push_back(Mask);
7426 }
 // Append VPI's last non-mask operand (presumably the called function --
 // confirm).
7427 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7428 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7429 *VPI, VPI->getDebugLoc());
7430 break;
7431 }
7432 case CallWideningDecision::KindTy::Scalarize:
7433 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7434 break;
7435 }
7436
7437 Replacement->insertBefore(VPI);
7438 VPI->replaceAllUsesWith(Replacement);
7439 VPI->eraseFromParent();
7440 }
7441 }
7442}
7443
// NOTE(review): source lines 7444-7445, which open this definition (return
// type, name and leading parameters), are absent from this listing, as are
// lines 7453, 7476, 7488, 7494, 7509 and 7531. `Plan` and `PSE` are used below
// without a visible declaration for that reason.
//
// Visible behaviour: for each non-consecutive VPWidenLoadRecipe whose address
// is a VPWidenGEPRecipe matching the add-recurrence pattern below, build a
// base pointer and a vector pointer from the recurrence start and constant
// step, create an experimental_vp_strided_load memory intrinsic and redirect
// the load's users to it.
7446 Loop &L, VPCostContext &Ctx,
7447 VFRange &Range) {
7448 if (Plan.hasScalarVFOnly())
7449 return;
7450
7451 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
 // Created lazily on the first conversion and then shared by later ones.
7452 VPValue *I32VF = nullptr;
7454 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7455 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7456 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7457 // TODO: Support strided store.
7458 // TODO: Transform reverse access into strided access with -1 stride.
7459 // TODO: Transform gather/scatter with uniform address into strided access
7460 // with 0 stride.
7461 // TODO: Transform interleave access into multiple strided accesses.
7462 if (!LoadR || LoadR->isConsecutive())
7463 continue;
7464
7465 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7466 if (!Ptr)
7467 continue;
7468
7469 // Check if this is a strided access by analyzing the address SCEV for an
7470 // affine addRec.
7471 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7472 const SCEV *Start;
7473 const SCEVConstant *Step;
7474 // TODO: Support non-constant loop invariant stride.
 // NOTE(review): the pattern that binds `Start` and `Step` is on source line
 // 7476, which is absent from this listing.
7475 if (!match(PtrSCEV,
7477 m_SpecificLoop(&L))))
7478 continue;
7479
7480 Type *LoadTy = LoadR->getScalarType();
7481 Align Alignment = LoadR->getAlign();
 // A VF qualifies when the target reports strided load/store as legal for the
 // vector type and the strided cost is strictly below the current load cost.
7482 auto IsProfitable = [&](ElementCount VF) {
7483 Type *DataTy = toVectorTy(LoadTy, VF);
7484 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7485 return false;
7486 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7487 const InstructionCost StridedLoadStoreCost =
7489 Intrinsic::experimental_vp_strided_load, DataTy,
7490 LoadR->isMasked(), Alignment, Ctx);
7491 return StridedLoadStoreCost < CurrentCost;
7492 };
7493
 // NOTE(review): the condition that consumes IsProfitable and `Range` starts
 // on source line 7494, which is absent from this listing.
7495 Range))
7496 continue;
7497
7498 // Invalidate the legacy widening decision so the cost of replaced load is
7499 // not counted during precomputeCosts.
7500 // TODO: Remove once the legacy exit cost computation is retired.
7501 for (ElementCount VF : Range)
7502 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7503
7504 // Get VF as i32 for the vector length operand.
7505 if (!I32VF) {
7506 VPBuilder Builder(Plan.getVectorPreheader());
7507 I32VF = Builder.createScalarZExtOrTrunc(
7508 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7510 }
7511
7512 VPBuilder Builder(LoadR);
7513 // Create the base pointer of strided access.
7514 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7515 // supports a general VPValue as the start value.
7516 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7517 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7518 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7519 assert(IndexTy == StrideInBytes->getScalarType() &&
7520 "Stride type from SCEV must match the index type");
7521 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7522 VectorLoop->getCanonicalIV(), IndexTy,
7523 VectorLoop->getCanonicalIVType(), DebugLoc::getUnknown());
7524 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
 // The offset is canonical IV * stride, carrying the add-recurrence's
 // no-wrap flags.
7525 auto *Offset = Builder.createOverflowingOp(
7526 Instruction::Mul, {CanIV, StrideInBytes},
7527 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7528 auto *BasePtr = Builder.createNoWrapPtrAdd(
7529 StartVPV, Offset,
7530 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7532
7533 // Create a new vector pointer for strided access.
7534 VPValue *NewPtr = Builder.createVectorPointer(
7535 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7536 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7537
 // An unmasked load is given the plan's all-true mask.
7538 VPValue *Mask = LoadR->getMask();
7539 if (!Mask)
7540 Mask = Plan.getTrue();
7541 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7542 Intrinsic::experimental_vp_strided_load,
7543 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7544 LoadR->getDebugLoc());
 // NOTE(review): LoadR is not erased in the visible lines; presumably it is
 // removed later as a dead recipe -- confirm.
7545 LoadR->replaceAllUsesWith(StridedLoad);
7546 }
7547 }
7548}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps, VPBasicBlock *Preheader)
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL)
Try to fold R using InstSimplifyFolder.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(ArrayRef< VPReplicateRecipe * > ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
SinkStoreInfo(VPReplicateRecipe &GroupLeader)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1692
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152
static DebugLoc getUnknown()
Definition DebugLoc.h:151
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1659
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1069
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4055
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4407
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4482
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4434
iterator end()
Definition VPlan.h:4444
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4442
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4495
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4454
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4456
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2957
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:3007
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2997
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3013
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2993
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as a successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessors of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:331
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:350
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:240
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:258
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:276
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:312
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:296
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3502
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1646
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
static VPSingleDefRecipe * createSingleScalarOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPValue *Mask, const VPIRFlags &Flags, const VPIRMetadata &Metadata, DebugLoc DL, Instruction *UV)
Create a single-scalar recipe with Opcode and Operands without inserting it.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4087
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4188
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B) const
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3547
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2436
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2483
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2472
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2163
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4560
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:892
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1473
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
unsigned getOpcode() const
Definition VPlan.h:1417
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3109
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3101
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3130
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3182
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3140
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1665
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3713
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPSingleDefRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a replicating or single-scalar recipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3353
A recipe for handling reduction phis.
Definition VPlan.h:2864
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2915
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2908
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2921
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3233
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4617
const VPBlockBase * getEntry() const
Definition VPlan.h:4661
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4693
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4678
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4737
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4745
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4729
const VPBlockBase * getExiting() const
Definition VPlan.h:4673
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4686
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3398
bool isSingleScalar() const
Definition VPlan.h:3456
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
operand_range operandsWithoutMask()
Return the recipe's operands, excluding the mask of a predicated recipe.
Definition VPlan.h:3481
bool isPredicated() const
Definition VPlan.h:3458
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3475
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:178
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4255
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:457
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:430
unsigned getNumOperands() const
Definition VPlanValue.h:424
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
bool hasDefiningRecipe() const
Returns true if this VPValue is defined by a recipe.
Definition VPlanValue.h:202
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1471
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:163
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1474
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1480
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2266
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2097
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1878
Instruction::CastOps getOpcode() const
Definition VPlan.h:1914
A recipe for handling GEP instructions.
Definition VPlan.h:2206
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2516
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2564
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2582
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2567
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2587
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2623
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2670
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2674
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2685
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2696
A recipe for widening vector intrinsics.
Definition VPlan.h:1925
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3749
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2754
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1817
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1838
unsigned getOpcode() const
Definition VPlan.h:1857
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4765
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5095
bool hasVF(ElementCount VF) const
Definition VPlan.h:4988
const DataLayout & getDataLayout() const
Definition VPlan.h:4970
LLVMContext & getContext() const
Definition VPlan.h:4966
VPBasicBlock * getEntry()
Definition VPlan.h:4861
bool hasScalableVF() const
Definition VPlan.h:4989
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4924
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4945
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4995
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5061
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4964
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5067
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5144
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5098
bool hasUF(unsigned UF) const
Definition VPlan.h:5013
VPIRValue * getPoison(Type *Ty)
Return a VPIRValue wrapping a poison value of type Ty.
Definition VPlan.h:5089
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4914
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4954
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4951
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5038
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5064
void setVF(ElementCount VF)
Definition VPlan.h:4976
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5029
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1061
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:5016
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4938
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4890
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5121
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5058
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4866
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4961
bool hasScalarVFOnly() const
Definition VPlan.h:5006
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4904
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4957
void setUF(unsigned UF)
Definition VPlan.h:5021
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5176
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1217
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5072
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2798
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
SelectLike_match< CondTy, LTy, RTy > m_SelectLike(const CondTy &C, const LTy &TrueC, const RTy &FalseC)
Matches a value that behaves like a boolean-controlled select, i.e.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:128
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1694
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1843
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1837
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:304
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2846
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1939
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3863
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3813
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3966
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3912
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan, ArgsTy &&...Args)
Helper to run a VPlan pass Pass on VPlan, forwarding extra arguments to the pass.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void simplifyReverses(VPlan &Plan)
Cancel out redundant reverses in Plan, e.g. reverse(reverse(x)) -> x.
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, const VPDominatorTree &VPDT)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...