LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
89 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
90 Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc(), GEP);
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96
97 // The noalias.scope.decl intrinsic declares a noalias scope that
98 // is valid for a single iteration. Emitting it as a single-scalar
99 // replicate would incorrectly extend the scope across multiple
100 // original iterations packed into one vector iteration.
101 // FIXME: If we want to vectorize this loop, then we have to drop
102 // all the associated !alias.scope and !noalias.
103 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
104 return false;
105
106 // These intrinsics are recognized by getVectorIntrinsicIDForCall
107 // but are not widenable. Emit them as replicate instead of widening.
108 if (VectorID == Intrinsic::assume ||
109 VectorID == Intrinsic::lifetime_end ||
110 VectorID == Intrinsic::lifetime_start ||
111 VectorID == Intrinsic::sideeffect ||
112 VectorID == Intrinsic::pseudoprobe) {
113 // If the operand of llvm.assume holds before vectorization, it will
114 // also hold per lane.
115 // llvm.pseudoprobe requires to be duplicated per lane for accurate
116 // sample count.
117 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
118 VectorID != Intrinsic::pseudoprobe;
119 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
120 /*IsSingleScalar=*/IsSingleScalar,
121 /*Mask=*/nullptr, *VPI, *VPI,
122 Ingredient.getDebugLoc());
123 } else {
124 NewRecipe = new VPWidenIntrinsicRecipe(
125 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
126 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
127 }
128 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
129 NewRecipe = new VPWidenCastRecipe(
130 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
131 VPIRFlags(*CI), VPIRMetadata(*CI));
132 } else {
133 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
134 *VPI, Ingredient.getDebugLoc());
135 }
136 } else {
138 "inductions must be created earlier");
139 continue;
140 }
141
142 NewRecipe->insertBefore(&Ingredient);
143 if (NewRecipe->getNumDefinedValues() == 1)
144 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
145 else
146 assert(NewRecipe->getNumDefinedValues() == 0 &&
147 "Only recpies with zero or one defined values expected");
148 Ingredient.eraseFromParent();
149 }
150 }
151 return true;
152}
153
154/// Helper for extra no-alias checks via known-safe recipe and SCEV.
156 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
157 VPReplicateRecipe &GroupLeader;
159 const Loop &L;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses
// Every early exit below returns false, i.e. the conservative "could not
// prove no-alias" answer.
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are analysed.
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
// Operand 1 is used as the store address; operand 0 (queried further down for
// its type) is the stored value.
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): the condition guarding this early return (original line 172)
// is not present in this listing -- confirm against upstream.
173 return false;
174
// The difference of the two address SCEVs must match a constant APInt;
// otherwise no distance is known and we give up.
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179
// Store sizes are computed from the scalar type of each stored value.
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = A->getOperand(0)->getScalarType();
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = B->getOperand(0)->getScalarType();
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): the definition of MaxVF (original line 192) is not present in
// this listing; presumably it is derived from VFs -- confirm against upstream.
// Scalable VFs are rejected because the comparison below needs a fixed value.
193 if (MaxVF.isScalable())
194 return false;
// No-alias is proven when the absolute constant distance is at least
// MaxVF times the larger of the two store sizes.
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198
199public:
202 const Loop &L)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L) {}
205
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that both
220/// read and write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
222static bool
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
226 bool CheckReads = SinkInfo.has_value();
227 if (!MemLoc.AATags.Scope)
228 return false;
229
230 for (VPBasicBlock *VPBB :
232 for (VPRecipeBase &R : *VPBB) {
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235
236 // Skip recipes that don't need checking.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Get the value type of the replicate load or store. \p IsLoad indicates
254/// whether it is a load.
256 return (IsLoad ? R : R->getOperand(0))->getScalarType();
257}
258
259/// Collect either replicated Loads or Stores grouped by their address SCEV and
260/// their load-store type, in a deep-traversal of the vector loop region in \p
261/// Plan.
262template <unsigned Opcode>
265 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
266 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
267 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
268 "Only Load and Store opcodes supported");
269 constexpr bool IsLoad = (Opcode == Instruction::Load);
272 RecipesByAddressAndType;
275 for (VPRecipeBase &R : *VPBB) {
276 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
277 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
278 continue;
279
280 // For loads, operand 0 is address; for stores, operand 1 is address.
281 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
282 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
283 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
284 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
285 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
286 }
287 }
288 auto Groups = to_vector(RecipesByAddressAndType.values());
289 VPDominatorTree VPDT(Plan);
290 for (auto &Group : Groups) {
291 // Sort mem ops by dominance order, with earliest (most dominating) first.
293 return VPDT.properlyDominates(A, B);
294 });
295 }
296 return Groups;
297}
298
/// Move recipes that define operands of recipes in replicate regions into the
/// block that uses them. Candidates are seeded from the operands of recipes in
/// replicate blocks, and the operands of every sunk recipe are considered in
/// turn. Returns true if at least one recipe was moved.
299static bool sinkScalarOperands(VPlan &Plan) {
300 auto Iter = vp_depth_first_deep(Plan.getEntry());
301 bool ScalarVFOnly = Plan.hasScalarVFOnly();
302 bool Changed = false;
303
// NOTE(review): the declaration of WorkList (original line 304) is not present
// in this listing; it is used below via insert(), size() and operator[] on
// (SinkTo, Candidate) pairs -- confirm its type against upstream.
// The lambda queues the recipe defining \p Op for sinking into \p SinkTo if
// it passes the filters below.
305 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
306 VPBasicBlock *SinkTo, VPValue *Op) {
// Values without a defining VPSingleDefRecipe cannot be sunk.
307 auto *Candidate =
308 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
309 if (!Candidate)
310 return;
311
312 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
313 // for now.
// NOTE(review): the condition for this return (original line 314) is not
// present in this listing.
315 return;
316
// Nothing to do if the candidate already lives in SinkTo, or if the shared
// helper reports that it must not be sunk.
317 if (Candidate->getParent() == SinkTo ||
318 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
319 return;
320
// Single-scalar replicate recipes are only queued when the plan has a scalar
// VF only.
321 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
322 if (!ScalarVFOnly && RepR->isSingleScalar())
323 return;
324
325 WorkList.insert({SinkTo, Candidate});
326 };
327
328 // First, collect the operands of all recipes in replicate blocks as seeds for
329 // sinking.
// NOTE(review): the loop header introducing VPR (original line 330) is not
// present in this listing.
331 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
// Only replicator regions whose entry block has exactly two successors are
// considered.
332 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
333 continue;
// The first successor of the entry is taken as the block to sink into; it must
// lead directly to the region's exiting block.
334 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
335 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
336 continue;
337 for (auto &Recipe : *VPBB)
338 for (VPValue *Op : Recipe.operands())
339 InsertIfValidSinkCandidate(VPBB, Op);
340 }
341
342 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// Index-based iteration with size() re-evaluated each time: entries are
// appended to WorkList from inside the loop body.
343 for (unsigned I = 0; I != WorkList.size(); ++I) {
344 VPBasicBlock *SinkTo;
345 VPSingleDefRecipe *SinkCandidate;
346 std::tie(SinkTo, SinkCandidate) = WorkList[I];
347
348 // All recipe users of SinkCandidate must be in the same block SinkTo or all
349 // users outside of SinkTo must only use the first lane of SinkCandidate. In
350 // the latter case, we need to duplicate SinkCandidate.
351 auto UsersOutsideSinkTo =
352 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
353 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
354 });
355 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
356 return !U->usesFirstLaneOnly(SinkCandidate);
357 }))
358 continue;
359 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
360
361 if (NeedsDuplicating) {
// Duplication is not attempted when the plan has a scalar VF only.
362 if (ScalarVFOnly)
363 continue;
364 VPSingleDefRecipe *Clone;
365 if (auto *SinkCandidateRepR =
366 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
367 // TODO: Handle converting to uniform recipes as separate transform,
368 // then cloning should be sufficient here.
// The copy left behind for the outside users is built as a single-scalar
// (third argument) replicate recipe without a mask.
369 Instruction *I = SinkCandidate->getUnderlyingInstr();
370 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
371 nullptr /*Mask*/, *SinkCandidateRepR,
372 *SinkCandidateRepR);
373 // TODO: add ".cloned" suffix to name of Clone's VPValue.
374 } else {
375 Clone = SinkCandidate->clone();
376 }
377
// The clone stays at the original position and takes over every use outside
// SinkTo; uses inside SinkTo keep referring to SinkCandidate.
378 Clone->insertBefore(SinkCandidate);
379 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
380 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
381 });
382 }
// Sink to the first non-phi position of SinkTo, then consider the operands of
// the recipe that was just moved as further candidates.
383 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
384 for (VPValue *Op : SinkCandidate->operands())
385 InsertIfValidSinkCandidate(SinkTo, Op);
386 Changed = true;
387 }
388 return Changed;
389}
390
391/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
392/// the mask.
394 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
395 if (!EntryBB || EntryBB->size() != 1 ||
396 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
397 return nullptr;
398
399 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
400}
401
402/// If \p R is a triangle region, return the 'then' block of the triangle.
404 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
405 if (EntryBB->getNumSuccessors() != 2)
406 return nullptr;
407
408 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
409 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
410 if (!Succ0 || !Succ1)
411 return nullptr;
412
413 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
414 return nullptr;
415 if (Succ0->getSingleSuccessor() == Succ1)
416 return Succ0;
417 if (Succ1->getSingleSuccessor() == Succ0)
418 return Succ1;
419 return nullptr;
420}
421
422// Merge replicate regions in their successor region, if a replicate region
423// is connected to a successor replicate region with the same predicate by a
424// single, empty VPBasicBlock.
426 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
427
428 // Collect replicate regions followed by an empty block, followed by another
429 // replicate region with matching masks to process up front. This is to avoid
430 // iterator invalidation issues while merging regions.
433 vp_depth_first_deep(Plan.getEntry()))) {
434 if (!Region1->isReplicator())
435 continue;
436 auto *MiddleBasicBlock =
437 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
438 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
439 continue;
440
441 auto *Region2 =
442 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
443 if (!Region2 || !Region2->isReplicator())
444 continue;
445
446 VPValue *Mask1 = getPredicatedMask(Region1);
447 VPValue *Mask2 = getPredicatedMask(Region2);
448 if (!Mask1 || Mask1 != Mask2)
449 continue;
450
451 assert(Mask1 && Mask2 && "both region must have conditions");
452 WorkList.push_back(Region1);
453 }
454
455 // Move recipes from Region1 to its successor region, if both are triangles.
456 for (VPRegionBlock *Region1 : WorkList) {
457 if (TransformedRegions.contains(Region1))
458 continue;
459 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
460 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
461
462 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
463 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
464 if (!Then1 || !Then2)
465 continue;
466
467 // Note: No fusion-preventing memory dependencies are expected in either
468 // region. Such dependencies should be rejected during earlier dependence
469 // checks, which guarantee accesses can be re-ordered for vectorization.
470 //
471 // Move recipes to the successor region.
472 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
473 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
474
475 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
476 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
477
478 // Move VPPredInstPHIRecipes from the merge block to the successor region's
479 // merge block. Update all users inside the successor region to use the
480 // original values.
481 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
482 VPValue *PredInst1 =
483 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
484 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
485 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
486 return cast<VPRecipeBase>(&U)->getParent() == Then2;
487 });
488
489 // Remove phi recipes that are unused after merging the regions.
490 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
491 Phi1ToMove.eraseFromParent();
492 continue;
493 }
494 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
495 }
496
497 // Remove the dead recipes in Region1's entry block.
498 for (VPRecipeBase &R :
499 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
500 R.eraseFromParent();
501
502 // Finally, remove the first region.
503 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
504 VPBlockUtils::disconnectBlocks(Pred, Region1);
505 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
506 }
507 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
508 TransformedRegions.insert(Region1);
509 }
510
511 return !TransformedRegions.empty();
512}
513
515 VPRegionBlock *ParentRegion,
516 VPlan &Plan) {
517 Instruction *Instr = PredRecipe->getUnderlyingInstr();
518 // Build the triangular if-then region.
519 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
520 assert(Instr->getParent() && "Predicated instruction not in any basic block");
521 auto *BlockInMask = PredRecipe->getMask();
522 auto *MaskDef = BlockInMask->getDefiningRecipe();
523 auto *BOMRecipe = new VPBranchOnMaskRecipe(
524 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
525 auto *Entry =
526 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
527
528 // Replace predicated replicate recipe with a replicate recipe without a
529 // mask but in the replicate region.
530 auto *RecipeWithoutMask = new VPReplicateRecipe(
531 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
532 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
533 PredRecipe->getDebugLoc());
534 auto *Pred =
535 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
536 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
538 Plan.createReplicateRegion(Entry, Exiting, RegionName);
539
540 // Note: first set Entry as region entry and then connect successors starting
541 // from it in order, to propagate the "parent" of each VPBasicBlock.
542 Region->setParent(ParentRegion);
543 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
544 VPBlockUtils::connectBlocks(Pred, Exiting);
545
546 if (PredRecipe->getNumUsers() != 0) {
547 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
548 RecipeWithoutMask->getDebugLoc());
549 Exiting->appendRecipe(PHIRecipe);
550 PredRecipe->replaceAllUsesWith(PHIRecipe);
551 }
552 PredRecipe->eraseFromParent();
553 return Region;
554}
555
/// Wrap every predicated VPReplicateRecipe of \p Plan in its own replicate
/// region: the containing block is split at the recipe and
/// createReplicateRegion is invoked for it.
556static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): the declaration of WorkList and the start of the block loop
// (original lines 557-558) are not present in this listing.
559 vp_depth_first_deep(Plan.getEntry()))) {
// Collect first and transform afterwards; the second loop splits blocks.
560 for (VPRecipeBase &R : *VPBB)
561 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
562 if (RepR->isPredicated())
563 WorkList.push_back(RepR);
564 }
565 }
566
// Running counter used to give each split-off block a distinct name suffix.
567 unsigned BBNum = 0;
568 for (VPReplicateRecipe *RepR : WorkList) {
// Split the containing block at the predicated recipe.
569 VPBasicBlock *CurrentBlock = RepR->getParent();
570 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
571
// Name the split-off block after the IR block of the underlying instruction
// plus a numeric suffix, or leave it unnamed if the IR block has no name.
572 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
573 SplitBlock->setName(
574 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
575 // Record predicated instructions for above packing optimizations.
// NOTE(review): original lines 576 and 578 are not present in this listing;
// presumably 576 binds the result of the call below to Region (used further
// down) -- confirm against upstream.
577 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
579
// If the block that was split used to be the exiting block of its parent
// region, the split-off block takes over that role.
580 VPRegionBlock *ParentRegion = Region->getParent();
581 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
582 ParentRegion->setExiting(SplitBlock);
583 }
584}
585
589 vp_depth_first_deep(Plan.getEntry()))) {
590 // Don't fold the blocks in the skeleton of the Plan into their single
591 // predecessors for now.
592 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
593 if (!VPBB->getParent())
594 continue;
595 auto *PredVPBB =
596 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
597 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
598 isa<VPIRBasicBlock>(PredVPBB))
599 continue;
600 WorkList.push_back(VPBB);
601 }
602
603 for (VPBasicBlock *VPBB : WorkList) {
604 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
605 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
606 R.moveBefore(*PredVPBB, PredVPBB->end());
607 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
608 auto *ParentRegion = VPBB->getParent();
609 if (ParentRegion && ParentRegion->getExiting() == VPBB)
610 ParentRegion->setExiting(PredVPBB);
611 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
612 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
613 }
614 return !WorkList.empty();
615}
616
618 // Convert masked VPReplicateRecipes to if-then region blocks.
620
621 bool ShouldSimplify = true;
622 while (ShouldSimplify) {
623 ShouldSimplify = sinkScalarOperands(Plan);
624 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
625 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
626 }
627}
628
629/// Remove redundant casts of inductions.
630///
631/// Such redundant casts are casts of induction variables that can be ignored,
632/// because we already proved that the casted phi is equal to the uncasted phi
633/// in the vectorized loop. There is no need to vectorize the cast - the same
634/// value can be used for both the phi and casts in the vector loop.
636 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
638 if (!IV || IV->getTruncInst())
639 continue;
640
641 // A sequence of IR Casts has potentially been recorded for IV, which
642 // *must be bypassed* when the IV is vectorized, because the vectorized IV
643 // will produce the desired casted value. This sequence forms a def-use
644 // chain and is provided in reverse order, ending with the cast that uses
645 // the IV phi. Search for the recipe of the last cast in the chain and
646 // replace it with the original IV. Note that only the final cast is
647 // expected to have users outside the cast-chain and the dead casts left
648 // over will be cleaned up later.
649 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
650 VPValue *FindMyCast = IV;
651 for (Instruction *IRCast : reverse(Casts)) {
652 VPSingleDefRecipe *FoundUserCast = nullptr;
653 for (auto *U : FindMyCast->users()) {
654 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
655 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
656 FoundUserCast = UserCast;
657 break;
658 }
659 }
660 // A cast recipe in the chain may have been removed by earlier DCE.
661 if (!FoundUserCast)
662 break;
663 FindMyCast = FoundUserCast;
664 }
665 if (FindMyCast != IV)
666 FindMyCast->replaceAllUsesWith(IV);
667 }
668}
669
672 Instruction::BinaryOps InductionOpcode,
673 FPMathOperator *FPBinOp, Instruction *TruncI,
674 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
675 VPBuilder &Builder) {
676 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
677 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
678 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
679 VPSingleDefRecipe *BaseIV =
680 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
681
682 // Truncate base induction if needed.
683 Type *ResultTy = BaseIV->getScalarType();
684 if (TruncI) {
685 Type *TruncTy = TruncI->getType();
686 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
687 "Not truncating.");
688 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
689 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
690 ResultTy = TruncTy;
691 }
692
693 // Truncate step if needed.
694 Type *StepTy = Step->getScalarType();
695 if (ResultTy != StepTy) {
696 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
697 "Not truncating.");
698 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
699 auto *VecPreheader =
701 VPBuilder::InsertPointGuard Guard(Builder);
702 Builder.setInsertPoint(VecPreheader);
703 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
704 }
705 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
706 &Plan.getVF(), DL);
707}
708
710 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
712 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
713 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
714 if (!LoopRegion)
715 return;
716
717 auto *WideCanIV =
719 if (!WideCanIV)
720 return;
721
722 Type *CanIVTy = LoopRegion->getCanonicalIVType();
723
724 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
725 // IV.
726 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
727 VPBuilder Builder(WideCanIV);
728 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
729 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
730 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
731 WideCanIV->getDebugLoc(), Builder));
732 WideCanIV->eraseFromParent();
733 return;
734 }
735
736 if (vputils::onlyScalarValuesUsed(WideCanIV))
737 return;
738
739 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
740 // in the header, reuse it instead of introducing another wide induction phi.
741 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
742 for (VPRecipeBase &Phi : Header->phis()) {
744 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
745 continue;
746 // The reused wide IV feeds the header mask, whose lanes may extend past
747 // the trip count; drop flags that only hold inside the scalar loop.
748 WidenIV->dropPoisonGeneratingFlags();
749 WideCanIV->replaceAllUsesWith(WidenIV);
750 WideCanIV->eraseFromParent();
751 return;
752 }
753
754 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
755 auto *VecTy = VectorType::get(CanIVTy, VF);
756 InstructionCost BroadcastCost = TTI.getShuffleCost(
758 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
759 if (PHICost > BroadcastCost)
760 return;
761
762 // Bail out if the additional wide induction phi increases the expected spill
763 // cost.
764 VPRegisterUsage UnrolledBase =
765 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
766 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
767 NumUsers *= UF;
768 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
769 VPRegisterUsage Projected = UnrolledBase;
770 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
771 if (Projected.spillCost(TTI, CostKind) >
772 UnrolledBase.spillCost(TTI, CostKind))
773 return;
774
777 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
778 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
779 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
780 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
781 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
782 WideCanIV->replaceAllUsesWith(NewWideIV);
783 WideCanIV->eraseFromParent();
784}
785
786/// Returns true if \p R is dead and can be removed.
/// Apart from the conditional-assume special case checked first, a recipe is
/// dead when it has no side effects and none of its defined values has users.
787static bool isDeadRecipe(VPRecipeBase &R) {
788 // Do remove conditional assume instructions as their conditions may be
789 // flattened.
790 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
// NOTE(review): the remainder of this condition (original line 792) is not
// present in this listing; presumably it identifies an llvm.assume call --
// confirm against upstream.
791 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
793 if (IsConditionalAssume)
794 return true;
795
// Recipes with side effects are kept, even when their values are unused.
796 if (R.mayHaveSideEffects())
797 return false;
798
799 // Recipe is dead if no user keeps the recipe alive.
800 return all_of(R.definedValues(),
801 [](VPValue *V) { return V->getNumUsers() == 0; });
802}
803
806 Plan.getEntry());
808 // The recipes in the block are processed in reverse order, to catch chains
809 // of dead recipes.
810 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
811 if (isDeadRecipe(R)) {
812 R.eraseFromParent();
813 continue;
814 }
815
816 // Check if R is a dead VPPhi <-> update cycle and remove it.
817 VPValue *Start, *Incoming;
818 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
819 continue;
820 auto *PhiR = cast<VPPhi>(&R);
821 VPUser *PhiUser = PhiR->getSingleUser();
822 if (!PhiUser)
823 continue;
824 if (PhiUser != Incoming->getDefiningRecipe() ||
825 Incoming->getNumUsers() != 1)
826 continue;
827 PhiR->replaceAllUsesWith(Start);
828 PhiR->eraseFromParent();
829 Incoming->getDefiningRecipe()->eraseFromParent();
830 }
831 }
832}
833
836 for (unsigned I = 0; I != Users.size(); ++I) {
838 for (VPValue *V : Cur->definedValues())
839 Users.insert_range(V->users());
840 }
841 return Users.takeVector();
842}
843
844/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
845/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
846/// generates scalar values.
847static VPValue *
849 VPlan &Plan, VPBuilder &Builder) {
851 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
852 VPValue *StepV = PtrIV->getOperand(1);
854 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
855 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
856
857 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
858 PtrIV->getDebugLoc(), "next.gep");
859}
860
861/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
862/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
863/// VPWidenPointerInductionRecipe will generate vectors only. If some users
864/// require vectors while others require scalars, the scalar uses need to extract
865/// the scalars from the generated vectors (Note that this is different to how
866/// int/fp inductions are handled). Legalize extract-from-ends using uniform
867/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
868/// the correct end value is available. Also optimize
869/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
870/// providing them scalar steps built on the canonical scalar IV and update the
871/// original IV's users. This is an optional optimization to reduce the needs of
872/// vector extracts.
875 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
876 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
877 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
878 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
879 if (!PhiR)
880 continue;
881
882 // Try to narrow wide and replicating recipes to uniform recipes, based on
883 // VPlan analysis.
884 // TODO: Apply to all recipes in the future, to replace legacy uniformity
885 // analysis.
886 auto Users = collectUsersRecursively(PhiR);
887 for (VPUser *U : reverse(Users)) {
888 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
889 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
890 // Skip recipes that shouldn't be narrowed.
891 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
892 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
893 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
894 continue;
895
896 // Skip recipes that may have other lanes than their first used.
898 continue;
899
900 // TODO: Support scalarizing ExtractValue.
901 if (match(Def,
903 continue;
904
905 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
906 Def->operands(), /*IsUniform*/ true,
907 /*Mask*/ nullptr, /*Flags*/ *Def);
908 Clone->insertAfter(Def);
909 Def->replaceAllUsesWith(Clone);
910 }
911
912 // Replace wide pointer inductions which have only their scalars used by
913 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
914 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
915 if (!Plan.hasScalarVFOnly() &&
916 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
917 continue;
918
919 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
920 PtrIV->replaceAllUsesWith(PtrAdd);
921 continue;
922 }
923
924 // Replace widened induction with scalar steps for users that only use
925 // scalars.
926 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
927 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
928 return U->usesScalars(WideIV);
929 }))
930 continue;
931
932 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
934 Plan, ID.getKind(), ID.getInductionOpcode(),
935 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
936 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
937 WideIV->getDebugLoc(), Builder);
938
939 // Update scalar users of IV to use Step instead.
940 if (!HasOnlyVectorVFs) {
941 assert(!Plan.hasScalableVF() &&
942 "plans containing a scalar VF cannot also include scalable VFs");
943 WideIV->replaceAllUsesWith(Steps);
944 } else {
945 bool HasScalableVF = Plan.hasScalableVF();
946 WideIV->replaceUsesWithIf(Steps,
947 [WideIV, HasScalableVF](VPUser &U, unsigned) {
948 if (HasScalableVF)
949 return U.usesFirstLaneOnly(WideIV);
950 return U.usesScalars(WideIV);
951 });
952 }
953 }
954}
955
956/// Check if \p VPV is an untruncated wide induction, either before or after the
957/// increment. If so return the header IV (before the increment), otherwise
958/// return null.
///
/// Two shapes are accepted:
///  * \p VPV is itself a wide induction recipe. It is returned unless it is an
///    int/fp induction with a truncate instruction attached.
///  * \p VPV is defined by a two-operand recipe that has a wide induction as
///    one of its operands and matches the increment pattern for that
///    induction's opcode. Truncation is not checked on this path.
961 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
962 if (WideIV) {
963 // VPV itself is a wide induction, separately compute the end value for exit
964 // users if it is not a truncated IV.
965 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
966 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
967 }
968
969 // Check if VPV is an optimizable induction increment.
// Only values defined by a two-operand recipe qualify; values without a
// defining recipe are rejected.
970 VPRecipeBase *Def = VPV->getDefiningRecipe();
971 if (!Def || Def->getNumOperands() != 2)
972 return nullptr;
// The wide induction may be either operand; operand 0 is tried first.
973 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
974 if (!WideIV)
975 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
976 if (!WideIV)
977 return nullptr;
978
// Returns true if VPV has the increment form expected for the opcode recorded
// in WideIV's induction descriptor.
979 auto IsWideIVInc = [&]() {
980 auto &ID = WideIV->getInductionDescriptor();
981
982 // Check if VPV increments the induction by the induction step.
983 VPValue *IVStep = WideIV->getStepValue();
984 switch (ID.getInductionOpcode()) {
985 case Instruction::Add:
986 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
987 case Instruction::FAdd:
988 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
// Unlike Add/FAdd, no m_c_ (commutative) matcher is used for FSub: the
// pattern lists WideIV first and IVStep second.
989 case Instruction::FSub:
990 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
991 m_Specific(IVStep)));
992 case Instruction::Sub: {
993 // IVStep will be the negated step of the subtraction. Check if Step == -1
994 // * IVStep.
995 VPValue *Step;
996 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
997 return false;
// The comparison is done on SCEV expressions; if either one cannot be
// computed the increment is not recognized.
998 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
999 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1000 ScalarEvolution &SE = *PSE.getSE();
1001 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1002 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1003 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1004 }
// Any other opcode is only accepted for pointer inductions, where the
// increment must be a GEP of WideIV by its step value.
1005 default:
1006 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1007 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1008 m_Specific(WideIV->getStepValue())));
1009 }
// Every case of the switch above returns, so this point is never reached.
1010 llvm_unreachable("should have been covered by switch above");
1011 };
1012 return IsWideIVInc() ? WideIV : nullptr;
1013}
1014
1015/// Attempts to optimize the induction variable exit values for users in the
1016/// early exit block.
1019 VPValue *Incoming, *Mask;
1021 m_VPValue(Incoming))))
1022 return nullptr;
1023
1024 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1025 if (!WideIV)
1026 return nullptr;
1027
1028 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1029 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1030 return nullptr;
1031
1032 // Calculate the final index.
1033 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1034 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1035 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1036 auto *ExtractR = cast<VPInstruction>(Op);
1037 VPBuilder B(ExtractR);
1038
1039 DebugLoc DL = ExtractR->getDebugLoc();
1040 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1041 FirstActiveLane = B.createScalarZExtOrTrunc(
1042 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1043 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1044
1045 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1046 // changed it means the exit is using the incremented value, so we need to
1047 // add the step.
1048 if (Incoming != WideIV) {
1049 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1050 EndValue = B.createAdd(EndValue, One, DL);
1051 }
1052
1053 if (!match(WideIV, m_CanonicalWidenIV())) {
1054 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1055 VPIRValue *Start = WideIV->getStartValue();
1056 VPValue *Step = WideIV->getStepValue();
1057 EndValue = B.createDerivedIV(
1058 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1059 Start, EndValue, Step);
1060 }
1061
1062 return EndValue;
1063}
1064
1065/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1066/// VPDerivedIVRecipe for non-canonical inductions.
///
/// Returns null if \p WideIV is an int/fp induction with a truncate
/// instruction. For a canonical wide IV the end value is \p VectorTC itself;
/// otherwise it is a derived IV built with \p VectorPHBuilder from the
/// induction's start value, \p VectorTC and the induction's step. If the
/// scalar type of the result differs from that of \p WideIV, a truncating
/// scalar cast to \p WideIV's scalar type is emitted.
1068 VPBuilder &VectorPHBuilder,
1069 VPValue *VectorTC) {
1070 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1071 // Truncated wide inductions resume from the last lane of their vector value
1072 // in the last vector iteration which is handled elsewhere.
1073 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1074 return nullptr;
1075
1076 VPIRValue *Start = WideIV->getStartValue();
1077 VPValue *Step = WideIV->getStepValue();
// NOTE(review): `ID`, used below, is not declared in the visible lines;
// presumably it is WideIV's induction descriptor — confirm against the
// original source.
1079 VPValue *EndValue = VectorTC;
1080 if (!match(WideIV, m_CanonicalWidenIV())) {
1081 EndValue = VectorPHBuilder.createDerivedIV(
1082 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1083 Start, VectorTC, Step);
1084 }
1085
1086 // EndValue is derived from the vector trip count (which has the same type as
1087 // the widest induction) and thus may be wider than the induction here.
1088 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1089 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
// The cast is attributed to WideIV's debug location.
1090 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1091 ScalarTypeOfWideIV,
1092 WideIV->getDebugLoc());
1093 }
1094
1095 return EndValue;
1096}
1097
1098/// Attempts to optimize the induction variable exit values for users in the
1099/// exit block coming from the latch in the original scalar loop.
1100static VPValue *
1104 VPValue *Incoming;
1106 return nullptr;
1107
1108 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1109 if (!WideIV)
1110 return nullptr;
1111
1112 VPValue *EndValue = EndValues.lookup(WideIV);
1113 assert(EndValue && "Must have computed the end value up front");
1114
1115 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1116 // changed it means the exit is using the incremented value, so we don't
1117 // need to subtract the step.
1118 if (Incoming != WideIV)
1119 return EndValue;
1120
1121 // Otherwise, subtract the step from the EndValue.
1122 auto *ExtractR = cast<VPInstruction>(Op);
1123 VPBuilder B(ExtractR);
1124 VPValue *Step = WideIV->getStepValue();
1125 Type *ScalarTy = WideIV->getScalarType();
1126 if (ScalarTy->isIntegerTy())
1127 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1128 if (ScalarTy->isPointerTy()) {
1129 Type *StepTy = Step->getScalarType();
1130 auto *Zero = Plan.getZero(StepTy);
1131 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1132 DebugLoc::getUnknown(), "ind.escape");
1133 }
1134 if (ScalarTy->isFloatingPointTy()) {
1135 const auto &ID = WideIV->getInductionDescriptor();
1136 return B.createNaryOp(
1137 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1138 ? Instruction::FSub
1139 : Instruction::FAdd,
1140 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1141 }
1142 llvm_unreachable("all possible induction types must be handled");
1143 return nullptr;
1144}
1145
1147 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1148 // Compute end values for all inductions.
1149 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1150 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1151 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1153 VPValue *ResumeTC =
1154 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1155 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1156 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1157 if (!WideIV)
1158 continue;
1159 if (VPValue *EndValue =
1160 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1161 EndValues[WideIV] = EndValue;
1162 }
1163
1164 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1165 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1166 VPValue *Op;
1167 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1168 continue;
1169 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1170 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1171 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1172 R.eraseFromParent();
1173 }
1174 }
1175
1176 // Then, optimize exit block users.
1177 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1178 for (VPRecipeBase &R : ExitVPBB->phis()) {
1179 auto *ExitIRI = cast<VPIRPhi>(&R);
1180
1181 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1182 VPValue *Escape = nullptr;
1183 if (PredVPBB == MiddleVPBB)
1185 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1186 else
1188 Plan, ExitIRI->getOperand(Idx), PSE);
1189 if (Escape)
1190 ExitIRI->setOperand(Idx, Escape);
1191 }
1192 }
1193 }
1194}
1195
1196/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1197/// them with already existing recipes expanding the same SCEV expression.
1200
1201 for (VPRecipeBase &R :
1203 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1204 if (!ExpR)
1205 continue;
1206
1207 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1208 if (Inserted)
1209 continue;
1210
1211 ExpR->replaceAllUsesWith(V->second);
1212 if (ExpR == Plan.getTripCount())
1213 Plan.resetTripCount(V->second);
1214
1215 ExpR->eraseFromParent();
1216 }
1217}
1218
1220 SmallVector<VPValue *> WorkList;
1222 WorkList.push_back(V);
1223
1224 while (!WorkList.empty()) {
1225 VPValue *Cur = WorkList.pop_back_val();
1226 if (!Seen.insert(Cur).second)
1227 continue;
1228 VPRecipeBase *R = Cur->getDefiningRecipe();
1229 if (!R)
1230 continue;
1231 if (!isDeadRecipe(*R))
1232 continue;
1233 append_range(WorkList, R->operands());
1234 R->eraseFromParent();
1235 }
1236}
1237
1238/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1239/// Returns an optional pair, where the first element indicates whether it is
1240/// an intrinsic ID.
1241static std::optional<std::pair<bool, unsigned>>
1243 return TypeSwitch<const VPSingleDefRecipe *,
1244 std::optional<std::pair<bool, unsigned>>>(R)
1247 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1248 .Case([](const VPWidenIntrinsicRecipe *I) {
1249 return std::make_pair(true, I->getVectorIntrinsicID());
1250 })
1251 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1252 [](auto *I) {
1253 // For recipes that do not directly map to LLVM IR instructions,
1254 // assign opcodes after the last VPInstruction opcode (which is also
1255 // after the last IR Instruction opcode), based on the VPRecipeID.
1256 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1257 I->getVPRecipeID());
1258 })
1259 .Default([](auto *) { return std::nullopt; });
1260}
1261
1262/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1263/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1264/// Operands are foldable live-ins.
1266 ArrayRef<VPValue *> Operands,
1267 const DataLayout &DL, LLVMContext &Ctx) {
1268 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1269 if (!OpcodeOrIID)
1270 return nullptr;
1271
1273 for (VPValue *Op : Operands) {
1274 if (!match(Op, m_LiveIn()))
1275 return nullptr;
1276 Value *V = Op->getUnderlyingValue();
1277 if (!V)
1278 return nullptr;
1279 Ops.push_back(V);
1280 }
1281
1282 auto FoldToIRValue = [&]() -> Value * {
1283 InstSimplifyFolder Folder(DL);
1284 if (OpcodeOrIID->first) {
1285 if (R.getNumOperands() != 2)
1286 return nullptr;
1287 unsigned ID = OpcodeOrIID->second;
1288 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], R.getScalarType());
1289 }
1290 unsigned Opcode = OpcodeOrIID->second;
1291 if (Instruction::isBinaryOp(Opcode))
1292 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1293 Ops[0], Ops[1]);
1294 if (Instruction::isCast(Opcode))
1295 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1296 R.getVPSingleValue()->getScalarType());
1297 switch (Opcode) {
1299 return Folder.FoldSelect(Ops[0], Ops[1],
1301 case VPInstruction::Not:
1302 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1304 case Instruction::Select:
1305 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1306 case Instruction::ICmp:
1307 case Instruction::FCmp:
1308 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1309 Ops[1]);
1310 case Instruction::GetElementPtr: {
1311 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1312 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1313 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1314 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1315 }
1318 return Folder.FoldGEP(IntegerType::getInt8Ty(Ctx), Ops[0], Ops[1],
1319 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1320 // An extract of a live-in is an extract of a broadcast, so return the
1321 // broadcasted element.
1322 case Instruction::ExtractElement:
1323 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1324 return Ops[0];
1325 }
1326 return nullptr;
1327 };
1328
1329 if (Value *V = FoldToIRValue())
1330 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1331 return nullptr;
1332}
1333
1334/// Try to simplify logical and bitwise recipes in \p Def.
1336 bool CanCreateNewRecipe) {
1337 VPlan *Plan = Def->getParent()->getPlan();
1338
1339 // Simplify (X && Y) | (X && !Y) -> X.
1340 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1341 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1342 // recipes to be visited during simplification.
1343 VPValue *X, *Y, *Z;
1344 if (match(Def,
1347 Def->replaceAllUsesWith(X);
1348 Def->eraseFromParent();
1349 return true;
1350 }
1351
1352 // x | AllOnes -> AllOnes
1353 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1354 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1355 return true;
1356 }
1357
1358 // x | 0 -> x
1359 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1360 Def->replaceAllUsesWith(X);
1361 return true;
1362 }
1363
1364 // x | !x -> AllOnes
1365 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1366 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1367 return true;
1368 }
1369
1370 // x & 0 -> 0
1371 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1372 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1373 return true;
1374 }
1375
1376 // x & AllOnes -> x
1377 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1378 Def->replaceAllUsesWith(X);
1379 return true;
1380 }
1381
1382 // x && false -> false
1383 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1384 Def->replaceAllUsesWith(Plan->getFalse());
1385 return true;
1386 }
1387
1388 // x && true -> x
1389 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1390 Def->replaceAllUsesWith(X);
1391 return true;
1392 }
1393
1394 // (x && y) | (x && z) -> x && (y | z)
1395 if (CanCreateNewRecipe &&
1398 // Simplify only if one of the operands has one use to avoid creating an
1399 // extra recipe.
1400 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1401 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1402 Def->replaceAllUsesWith(
1403 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1404 return true;
1405 }
1406
1407 // x && (x && y) -> x && y
1408 if (match(Def, m_LogicalAnd(m_VPValue(X),
1410 Def->replaceAllUsesWith(Def->getOperand(1));
1411 return true;
1412 }
1413
1414 // x && (y && x) -> x && y
1415 if (match(Def, m_LogicalAnd(m_VPValue(X),
1417 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1418 return true;
1419 }
1420
1421 // x && !x -> false
1422 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1423 Def->replaceAllUsesWith(Plan->getFalse());
1424 return true;
1425 }
1426
1427 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1428 Def->replaceAllUsesWith(X);
1429 return true;
1430 }
1431
1432 // select c, false, true -> not c
1433 VPValue *C;
1434 if (CanCreateNewRecipe &&
1435 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1436 Def->replaceAllUsesWith(Builder.createNot(C));
1437 return true;
1438 }
1439
1440 // select !c, x, y -> select c, y, x
1441 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1442 Def->setOperand(0, C);
1443 Def->setOperand(1, Y);
1444 Def->setOperand(2, X);
1445 return true;
1446 }
1447
1448 // select x, (i1 y | z), y -> y | (x && z)
1449 if (CanCreateNewRecipe &&
1450 match(Def, m_Select(m_VPValue(X),
1452 m_Deferred(Y))) &&
1453 Y->getScalarType()->isIntegerTy(1)) {
1454 Def->replaceAllUsesWith(
1455 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1456 return true;
1457 }
1458
1459 return false;
1460}
1461
1462/// Try to simplify VPSingleDefRecipe \p Def.
1464 VPlan *Plan = Def->getParent()->getPlan();
1465
1466 // Simplification of live-in IR values for SingleDef recipes using
1467 // InstSimplifyFolder.
1468 const DataLayout &DL = Plan->getDataLayout();
1469 if (VPValue *V =
1470 tryToFoldLiveIns(*Def, Def->operands(), DL, Plan->getContext()))
1471 return Def->replaceAllUsesWith(V);
1472
1473 // Fold PredPHI LiveIn -> LiveIn.
1474 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1475 VPValue *Op = PredPHI->getOperand(0);
1476 if (isa<VPIRValue>(Op))
1477 PredPHI->replaceAllUsesWith(Op);
1478 }
1479
1480 VPBuilder Builder(Def);
1481
1482 // Avoid replacing VPInstructions with underlying values with new
1483 // VPInstructions, as we would fail to create widen/replicate recipes from the
1484 // new VPInstructions without an underlying value, and miss out on some
1485 // transformations that only apply to widened/replicated recipes later, by
1486 // doing so.
1487 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1488 // VPInstructions without underlying values, as those will get skipped during
1489 // cost computation.
1490 bool CanCreateNewRecipe =
1491 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1492
1493 VPValue *A;
1494 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1495 Type *TruncTy = Def->getScalarType();
1496 Type *ATy = A->getScalarType();
1497 if (TruncTy == ATy) {
1498 Def->replaceAllUsesWith(A);
1499 } else {
1500 // Don't replace a non-widened cast recipe with a widened cast.
1501 if (!isa<VPWidenCastRecipe>(Def))
1502 return;
1503 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1504
1505 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1506 ? Instruction::SExt
1507 : Instruction::ZExt;
1508 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1509 TruncTy);
1510 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1511 // UnderlyingExt has distinct return type, used to retain legacy cost.
1512 Ext->setUnderlyingValue(UnderlyingExt);
1513 }
1514 Def->replaceAllUsesWith(Ext);
1515 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1516 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1517 Def->replaceAllUsesWith(Trunc);
1518 }
1519 }
1520 }
1521
1522 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1523 return;
1524
1525 VPValue *X, *Y, *C;
1526 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1527 return Def->replaceAllUsesWith(A);
1528
1529 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1530 return Def->replaceAllUsesWith(A);
1531
1532 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1533 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1534
1535 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1536 // Preserve nsw from the Mul on the new Sub.
1538 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1539 return Def->replaceAllUsesWith(Builder.createSub(
1540 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1541 }
1542
1543 if (CanCreateNewRecipe &&
1545 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1546 // new Sub.
1548 false,
1549 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1550 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1551 ->hasNoSignedWrap()};
1552 return Def->replaceAllUsesWith(
1553 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1554 }
1555
1556 const APInt *APC;
1557 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1558 APC->isPowerOf2())
1559 return Def->replaceAllUsesWith(Builder.createNaryOp(
1560 Instruction::Shl,
1561 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1562 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1563
1564 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1565 APC->isPowerOf2())
1566 return Def->replaceAllUsesWith(Builder.createNaryOp(
1567 Instruction::LShr,
1568 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1569 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1570
1571 if (match(Def, m_Not(m_VPValue(A)))) {
1572 if (match(A, m_Not(m_VPValue(A))))
1573 return Def->replaceAllUsesWith(A);
1574
1575 // Try to fold Not into compares by adjusting the predicate in-place.
1576 CmpPredicate Pred;
1577 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1578 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1579 if (all_of(Cmp->users(),
1581 m_Not(m_Specific(Cmp)),
1582 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1583 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1584 for (VPUser *U : to_vector(Cmp->users())) {
1585 auto *R = cast<VPSingleDefRecipe>(U);
1586 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1587 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1588 R->setOperand(1, Y);
1589 R->setOperand(2, X);
1590 } else {
1591 // not (cmp pred) -> cmp inv_pred
1592 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1593 R->replaceAllUsesWith(Cmp);
1594 }
1595 }
1596 // If Cmp doesn't have a debug location, use the one from the negation,
1597 // to preserve the location.
1598 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1599 Cmp->setDebugLoc(Def->getDebugLoc());
1600 }
1601 }
1602 }
1603
1604 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1605 // any-of (fcmp uno %A, %B), ...
1606 if (match(Def, m_AnyOf())) {
1608 VPRecipeBase *UnpairedCmp = nullptr;
1609 for (VPValue *Op : Def->operands()) {
1610 VPValue *X;
1611 if (Op->getNumUsers() > 1 ||
1613 m_Deferred(X)))) {
1614 NewOps.push_back(Op);
1615 } else if (!UnpairedCmp) {
1616 UnpairedCmp = Op->getDefiningRecipe();
1617 } else {
1618 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1619 UnpairedCmp->getOperand(0), X));
1620 UnpairedCmp = nullptr;
1621 }
1622 }
1623
1624 if (UnpairedCmp)
1625 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1626
1627 if (NewOps.size() < Def->getNumOperands()) {
1628 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1629 return Def->replaceAllUsesWith(NewAnyOf);
1630 }
1631 }
1632
1633 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1634 // This is useful for fmax/fmin without fast-math flags, where we need to
1635 // check if any operand is NaN.
1636 if (CanCreateNewRecipe &&
1638 m_Deferred(X)),
1640 m_Deferred(Y))))) {
1641 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1642 return Def->replaceAllUsesWith(NewCmp);
1643 }
1644
1645 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1646 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1647 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1648 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1649 return Def->replaceAllUsesWith(Def->getOperand(1));
1650
1652 m_One()))) {
1653 Type *WideStepTy = Def->getScalarType();
1654 if (X->getScalarType() != WideStepTy)
1655 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1656 Def->replaceAllUsesWith(X);
1657 return;
1658 }
1659
1660 // For i1 vp.merges produced by AnyOf reductions:
1661 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1663 m_VPValue(X), m_VPValue())) &&
1665 Def->getScalarType()->isIntegerTy(1)) {
1666 Def->setOperand(1, Def->getOperand(0));
1667 Def->setOperand(0, Y);
1668 return;
1669 }
1670
1671 // Simplify MaskedCond with no block mask to its single operand.
1673 !cast<VPInstruction>(Def)->isMasked())
1674 return Def->replaceAllUsesWith(Def->getOperand(0));
1675
1676 // Look through ExtractLastLane.
1677 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1678 if (match(A, m_BuildVector())) {
1679 auto *BuildVector = cast<VPInstruction>(A);
1680 Def->replaceAllUsesWith(
1681 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1682 return;
1683 }
1684
1685 if (match(A, m_Broadcast(m_VPValue(X))))
1686 return Def->replaceAllUsesWith(X);
1687
1689 return Def->replaceAllUsesWith(A);
1690
1691 if (Plan->hasScalarVFOnly())
1692 return Def->replaceAllUsesWith(A);
1693 }
1694
1695 // Look through ExtractPenultimateElement (BuildVector ....).
1697 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1698 Def->replaceAllUsesWith(
1699 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1700 return;
1701 }
1702
1703 uint64_t Idx;
1705 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1706 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1707 return;
1708 }
1709
1710 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1711 Def->replaceAllUsesWith(
1712 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1713 return;
1714 }
1715
1716 // Look through broadcast of single-scalar when used as select conditions; in
1717 // that case the scalar condition can be used directly.
1718 if (match(Def,
1721 "broadcast operand must be single-scalar");
1722 Def->setOperand(0, C);
1723 return;
1724 }
1725
1726 if (match(Def, m_Broadcast(m_VPValue(X))))
1727 return Def->replaceUsesWithIf(
1728 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1729
1731 if (Def->getNumOperands() == 1) {
1732 Def->replaceAllUsesWith(Def->getOperand(0));
1733 return;
1734 }
1735 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1736 if (all_equal(Phi->incoming_values()))
1737 Phi->replaceAllUsesWith(Phi->getOperand(0));
1738 }
1739 return;
1740 }
1741
1742 VPIRValue *IRV;
1743 if (Def->getNumOperands() == 1 &&
1745 return Def->replaceAllUsesWith(IRV);
1746
1747 // Some simplifications can only be applied after unrolling. Perform them
1748 // below.
1749 if (!Plan->isUnrolled())
1750 return;
1751
1752 // After unrolling, extract-lane may be used to extract values from multiple
1753 // scalar sources. Only simplify when extracting from a single scalar source.
1754 VPValue *LaneToExtract;
1755 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1756 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1758 return Def->replaceAllUsesWith(A);
1759
1760 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1761 // scalar canonical IV.
1763 if (match(LaneToExtract, m_ZeroInt()) &&
1764 match(A, m_CanonicalWidenIV(WidenIV)))
1765 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1766
1767 // Simplify extract-lane with single source to extract-element.
1768 Def->replaceAllUsesWith(Builder.createNaryOp(
1769 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1770 return;
1771 }
1772
1773 // Look for cycles where Def is of the form:
1774 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1775 // IVInc = X + Step ; used by X and Def
1776 // Def = IVInc + Y
1777 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1778 // and if Inc exists, replace it with X.
1779 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1780 isa<VPIRValue>(Y) &&
1781 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1782 auto *Phi = cast<VPPhi>(X);
1783 auto *IVInc = Def->getOperand(0);
1784 if (IVInc->getNumUsers() == 2) {
1785 // If Phi has a second user (besides IVInc's defining recipe), it must
1786 // be Inc = Phi + Y for the fold to apply.
1788 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1789 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1790 Def->replaceAllUsesWith(IVInc);
1791 if (Inc)
1792 Inc->replaceAllUsesWith(Phi);
1793 Phi->setOperand(0, Y);
1794 return;
1795 }
1796 }
1797 }
1798
1799 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1800 // just the pointer operand.
1801 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1802 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1803 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1804
1805 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1806 // the start index is zero and only the first lane is demanded.
1807 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1808 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1809 Steps->replaceAllUsesWith(Steps->getOperand(0));
1810 return;
1811 }
1812 }
1813 // Simplify redundant ReductionStartVector recipes after unrolling.
1814 VPValue *StartV;
1816 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1817 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1818 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1819 return PhiR && PhiR->isInLoop();
1820 });
1821 return;
1822 }
1823
1824 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1825 return Def->replaceAllUsesWith(A);
1826}
1827
1837
1838/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1839/// header mask to be simplified further when tail folding, e.g. in
1840/// optimizeEVLMasks.
///
/// Does nothing if \p Plan has no header mask. Rewritten recipes have their
/// uses replaced but are not erased here.
1841static void reassociateHeaderMask(VPlan &Plan) {
1842 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1843 if (!HeaderMask)
1844 return;
1845
// Worklist of candidate users, seeded from the logical-ands that have the
// header mask as their first operand.
1846 SmallVector<VPUser *> Worklist;
1847 for (VPUser *U : HeaderMask->users())
// NOTE(review): the statement controlled by this `if` is not visible in this
// listing; presumably it seeds Worklist, which is otherwise empty before the
// loop below — confirm against the original source.
1848 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1850
1851 while (!Worklist.empty()) {
// Entries that are not single-def recipes, or that do not have the form
// (headermask && X) && Y, are dropped.
1852 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1853 VPValue *X, *Y;
1854 if (!R || !match(R, m_LogicalAnd(
1855 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1856 m_VPValue(Y))))
1857 continue;
// Queue R's users before they are redirected to the replacement: once they
// use the new headermask && (X && Y) value they may match the pattern too.
1858 append_range(Worklist, R->users());
1859 VPBuilder Builder(R);
1860 R->replaceAllUsesWith(
1861 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1862 }
1863}
1864
/// Map a masked integer division/remainder intrinsic ID to the corresponding
/// plain binary opcode (UDiv, SDiv, URem or SRem). Any other ID yields an
/// empty optional.
1865 static std::optional<Instruction::BinaryOps>
1867 switch (ID) {
1868 case Intrinsic::masked_udiv:
1869 return Instruction::UDiv;
1870 case Intrinsic::masked_sdiv:
1871 return Instruction::SDiv;
1872 case Intrinsic::masked_urem:
1873 return Instruction::URem;
1874 case Intrinsic::masked_srem:
1875 return Instruction::SRem;
1876 default:
     // Not one of the masked div/rem intrinsics listed above.
1877 return {};
1878 }
1879 }
1880
// Plans that only have a scalar VF are left untouched.
1882 if (Plan.hasScalarVFOnly())
1883 return;
1884
1886 vp_depth_first_deep(Plan.getEntry()))) {
     // Walk each block bottom-up. The early-inc range allows the current recipe
     // to be erased while iterating.
1887 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1890 continue;
     // Replicate recipes that are already single-scalar, or that are
     // predicated, are skipped.
1891 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1892 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1893 continue;
1894
1895 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
     // A replicated store whose operand 1 (presumably the address -- confirm)
     // is a single scalar is replaced by a single-scalar, unmasked replicate.
     // Its operand 0 becomes the last lane of the original operand 0; when
     // operand 1 is uniform across VFs and UFs the last part is extracted
     // first.
1896 if (RepR && RepR->getOpcode() == Instruction::Store &&
1897 vputils::isSingleScalar(RepR->getOperand(1))) {
1898 auto *Clone = new VPReplicateRecipe(
1899 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1900 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1901 *RepR /*Metadata*/, RepR->getDebugLoc());
1902 Clone->insertBefore(RepOrWidenR);
1903 VPBuilder Builder(Clone);
1904 VPValue *ExtractOp = Clone->getOperand(0);
1905 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1906 ExtractOp =
1907 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1908 ExtractOp =
1909 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1910 Clone->setOperand(0, ExtractOp);
1911 RepR->eraseFromParent();
1912 continue;
1913 }
1914
1915 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1916 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
     // Only applies when just the first lane of the result is used and the
     // intrinsic is one of the masked div/rem intrinsics.
1917 if (!vputils::onlyFirstLaneUsed(IntrR))
1918 continue;
1919 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1920 if (!Opc)
1921 continue;
     // The divisor becomes select(operand 2, operand 1, 1), and the intrinsic
     // is replaced by the plain div/rem opcode on operand 0 and that divisor.
1922 VPBuilder Builder(IntrR);
1923 VPValue *SafeDivisor = Builder.createSelect(
1924 IntrR->getOperand(2), IntrR->getOperand(1),
1925 Plan.getConstantInt(IntrR->getScalarType(), 1));
1926 VPValue *Clone = Builder.createNaryOp(
1927 *Opc, {IntrR->getOperand(0), SafeDivisor},
1928 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1929 IntrR->replaceAllUsesWith(Clone);
1930 IntrR->eraseFromParent();
1931 continue;
1932 }
1933
1934 // Skip recipes that aren't single scalars.
1935 if (!vputils::isSingleScalar(RepOrWidenR))
1936 continue;
1937
1938 // Predicate to check if a user of Op introduces extra broadcasts.
     // NOTE(review): the condition exempting certain VPInstruction opcodes is
     // only partially visible in this listing.
1939 auto IntroducesBCastOf = [](const VPValue *Op) {
1940 return [Op](const VPUser *U) {
1941 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1945 VPI->getOpcode()))
1946 return false;
1947 }
     // Otherwise a user counts as broadcasting Op unless it uses Op's scalars.
1948 return !U->usesScalars(Op);
1949 };
1950 };
1951
     // Skip the recipe when some user would broadcast its result and no operand
     // satisfies both of the following: none of its other users broadcasts it,
     // and it is either a non-constant live-in or a single-scalar replicate.
1952 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1953 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1954 if (any_of(
1955 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1956 IntroducesBCastOf(Op)))
1957 return false;
1958 // Non-constant live-ins require broadcasts, while constants do not
1959 // need explicit broadcasts.
1960 auto *IRV = dyn_cast<VPIRValue>(Op);
1961 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1962 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1963 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1964 }))
1965 continue;
1966
     // Replace the recipe with an unmasked single-scalar replicate of the same
     // underlying instruction and operands; erase the original once dead.
1967 auto *Clone = new VPReplicateRecipe(
1968 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1969 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1970 Clone->insertBefore(RepOrWidenR);
1971 RepOrWidenR->replaceAllUsesWith(Clone);
1972 if (isDeadRecipe(*RepOrWidenR))
1973 RepOrWidenR->eraseFromParent();
1974 }
1975 }
1976 }
1977
1978 /// Try to see if all of \p Blend's masks share a common value logically and'ed
1979 /// and remove it from the masks.
     // Normalized blends are not handled.
1981 if (Blend->isNormalized())
1982 return;
     // The candidate is the first operand of mask 0, which must itself be a
     // logical-and.
1983 VPValue *CommonEdgeMask;
1984 if (!match(Blend->getMask(0),
1985 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1986 return;
     // Every mask must be a logical-and whose first operand is the candidate.
1987 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1988 if (!match(Blend->getMask(I),
1989 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1990 return;
     // Replace each mask with operand 1 of its defining recipe, i.e. the other
     // operand of the logical-and.
1991 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1992 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1993 }
1994
1995 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1996 /// to make sure the masks are simplified.
1997 static void simplifyBlends(VPlan &Plan) {
     // NOTE(review): the header of the enclosing loop over blocks (which
     // defines VPBB) is not visible in this listing.
2000 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2001 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2002 if (!Blend)
2003 continue;
2004
2005 removeCommonBlendMask(Blend);
2006
2007 // Try to remove redundant blend recipes.
     // Collect the distinct incoming values whose mask is not known to be
     // false. Incoming value 0 is always counted for a normalized blend.
2008 SmallPtrSet<VPValue *, 4> UniqueValues;
2009 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2010 UniqueValues.insert(Blend->getIncomingValue(0));
2011 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2012 if (!match(Blend->getMask(I), m_False()))
2013 UniqueValues.insert(Blend->getIncomingValue(I));
2014
     // With a single distinct value the blend is redundant; forward that value
     // to all users and erase the blend.
2015 if (UniqueValues.size() == 1) {
2016 Blend->replaceAllUsesWith(*UniqueValues.begin());
2017 Blend->eraseFromParent();
2018 continue;
2019 }
2020
2021 if (Blend->isNormalized())
2022 continue;
2023
2024 // Normalize the blend so its first incoming value is used as the initial
2025 // value with the others blended into it.
2026
     // Pick the first incoming value whose mask has exactly one user and is not
     // known false; default to index 0 when there is none.
2027 unsigned StartIndex = 0;
2028 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2029 // If a value's mask is used only by the blend then it can be dead-coded.
2030 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2031 // that's used by multiple blends where it can be removed from them all.
2032 VPValue *Mask = Blend->getMask(I);
2033 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
2034 StartIndex = I;
2035 break;
2036 }
2037 }
2038
     // New operand list: the start value first (without its mask), followed by
     // (value, mask) pairs for all remaining incoming values.
2039 SmallVector<VPValue *, 4> OperandsWithMask;
2040 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2041
2042 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2043 if (I == StartIndex)
2044 continue;
2045 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2046 OperandsWithMask.push_back(Blend->getMask(I));
2047 }
2048
2049 auto *NewBlend =
2050 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2051 OperandsWithMask, *Blend, Blend->getDebugLoc());
2052 NewBlend->insertBefore(&R);
2053
     // The mask of the start value is not an operand of the new blend.
     // NOTE(review): the statement that consumes DeadMask is not visible in
     // this listing; presumably it deletes the mask if it became dead.
2054 VPValue *DeadMask = Blend->getMask(StartIndex);
2055 Blend->replaceAllUsesWith(NewBlend);
2056 Blend->eraseFromParent();
2058
2059 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
2060 VPValue *NewMask;
2061 if (NewBlend->getNumOperands() == 3 &&
2062 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2063 VPValue *Inc0 = NewBlend->getOperand(0);
2064 VPValue *Inc1 = NewBlend->getOperand(1);
2065 VPValue *OldMask = NewBlend->getOperand(2);
2066 NewBlend->setOperand(0, Inc1);
2067 NewBlend->setOperand(1, Inc0);
2068 NewBlend->setOperand(2, NewMask);
     // Erase the Not if the swap removed its last user.
2069 if (OldMask->getNumUsers() == 0)
2070 cast<VPInstruction>(OldMask)->eraseFromParent();
2071 }
2072 }
2073 }
2074 }
2075
2076 /// Optimize the width of vector induction variables in \p Plan based on a known
2077 /// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if at least one induction was replaced by a narrower one.
2079 ElementCount BestVF,
2080 unsigned BestUF) {
2081 // Only proceed if we have not completely removed the vector region.
2082 if (!Plan.getVectorLoopRegion())
2083 return false;
2084
     // Requires a fixed VF and a trip count that matches a constant APInt.
2085 const APInt *TC;
2086 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2087 return false;
2088
2089 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2090 // and UF. Returns at least 8.
     // NOTE(review): the expression computing AlignedTC is not visible in this
     // listing; presumably TC rounded up to a multiple of Align -- confirm.
2091 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2092 APInt AlignedTC =
2095 APInt MaxVal = AlignedTC - 1;
2096 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2097 };
2098 unsigned NewBitWidth =
2099 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2100
2101 LLVMContext &Ctx = Plan.getContext();
2102 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2103
2104 bool MadeChange = false;
2105
2106 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2107 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2108 // Currently only handle canonical IVs as it is trivial to replace the start
2109 // and stop values, and we currently only perform the optimization when the
2110 // IV has a single use.
2112 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2113 continue;
     // Also skip when the IV already has the computed type.
2114 if (WideIV->hasMoreThanOneUniqueUser() ||
2115 NewIVTy == WideIV->getScalarType())
2116 continue;
2117
2118 // Currently only handle cases where the single user is a header-mask
2119 // comparison with the backedge-taken-count.
2120 VPUser *SingleUser = WideIV->getSingleUser();
2121 if (!SingleUser ||
2122 !match(SingleUser,
2123 m_ICmp(m_Specific(WideIV),
2125 continue;
2126
2127 // Update IV operands and comparison bound to use new narrower type.
2128 assert(!WideIV->getTruncInst() &&
2129 "canonical IV is not expected to have a truncation");
     // The replacement induction has type NewIVTy, starts at zero and steps by
     // one.
2130 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2131 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2132 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2133 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2134 NewWideIV->insertBefore(WideIV);
2135
     // Truncate the backedge-taken count to the new type; the cast is appended
     // to the vector preheader.
2136 auto *NewBTC = new VPWidenCastRecipe(
2137 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2138 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2139 Plan.getVectorPreheader()->appendRecipe(NewBTC);
     // Recreate the compare with the same predicate on the narrowed operands
     // and redirect the users of the old compare to it.
2140 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2141 Cmp->replaceAllUsesWith(
2142 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2143
2144 MadeChange = true;
2145 }
2146
2147 return MadeChange;
2148 }
2149
2150 /// Return true if \p Cond is known to be true for given \p BestVF and \p
2151 /// BestUF.
2153 ElementCount BestVF, unsigned BestUF,
     // Recursive case: true if any operand of Cond's defining recipe is itself
     // known to be true.
     // NOTE(review): the condition guarding this return is not visible in this
     // listing; presumably Cond is a logical-or -- confirm.
2156 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2157 &PSE](VPValue *C) {
2158 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2159 });
2160
     // Otherwise Cond must match a pattern built from CanIV + VFxUF and the
     // vector trip count (the start of the match is not visible here).
2161 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2164 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2165 m_Specific(&Plan.getVectorTripCount()))))
2166 return false;
2167
2168 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2169 // count is not conveniently available as SCEV so far, so we compare directly
2170 // against the original trip count. This is stricter than necessary, as we
2171 // will only return true if the trip count == vector trip count.
2172 const SCEV *VectorTripCount =
     // Fall back to the SCEV of the original trip count.
2174 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2175 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2176 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2177 "Trip count SCEV must be computable");
     // Known true when the trip count is provably equal to BestVF * BestUF
     // elements.
2178 ScalarEvolution &SE = *PSE.getSE();
2179 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2180 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2181 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2182 }
2183
2184 /// Try to replace multiple active lane masks used for control flow with
2185 /// a single, wide active lane mask instruction followed by multiple
2186 /// extract subvector intrinsics. This applies to the active lane mask
2187 /// instructions both in the loop and in the preheader.
2188 /// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2189 /// new extracts from the first active lane mask, which has its last
2190 /// operand (multiplier) set to UF.
/// Returns true if the plan was changed.
2192 unsigned UF) {
     // Requires the wide-mask option to be enabled, a vector VF and UF != 1.
2193 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2194 return false;
2195
2196 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2197 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2198 auto *Term = &ExitingVPBB->back();
2199
     // NOTE(review): the start of the terminator match is not visible in this
     // listing; the visible tail matches an operation with three operands.
2200 using namespace llvm::VPlanPatternMatch;
2202 m_VPValue(), m_VPValue(), m_VPValue())))))
2203 return false;
2204
2205 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2206 LLVMContext &Ctx = Plan.getContext();
2207
     // For each unroll part, create a vector_extract of ALM at element offset
     // VF.getKnownMinValue() * Part, record it in Extracts[Part] and insert it
     // after ALM.
2208 auto ExtractFromALM = [&](VPInstruction *ALM,
2209 SmallVectorImpl<VPValue *> &Extracts) {
2210 DebugLoc DL = ALM->getDebugLoc();
2211 for (unsigned Part = 0; Part < UF; ++Part) {
2213 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2214 auto *Ext =
2215 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2216 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2217 Extracts[Part] = Ext;
2218 Ext->insertAfter(ALM);
2219 }
2220 };
2221
2222 // Create a list of each active lane mask phi, ordered by unroll part.
2224 for (VPRecipeBase &R : Header->phis()) {
2226 if (!Phi)
2227 continue;
2228 VPValue *Index = nullptr;
2229 match(Phi->getBackedgeValue(),
2231 assert(Index && "Expected index from ActiveLaneMask instruction");
2232
     // The unroll part is taken from the constant multiplier in the index
     // pattern; an index that does not match belongs to part 0.
2233 uint64_t Part;
2234 if (match(Index,
2236 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2237 Phis[Part] = Phi;
2238 else {
2239 // Anything other than a CanonicalIVIncrementForPart is part 0
2240 assert(!match(
2241 Index,
2243 Phis[0] = Phi;
2244 }
2245 }
2246
2247 assert(all_of(Phis, not_equal_to(nullptr)) &&
2248 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2249
     // The masks feeding part 0 become the wide masks.
2250 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2251 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2252
2253 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2254 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2255 "Expected incoming values of Phi to be ActiveLaneMasks");
2256
2257 // When using wide lane masks, the return type of the get.active.lane.mask
2258 // intrinsic is VF x UF (last operand).
2259 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2260 EntryALM->setOperand(2, ALMMultiplier);
2261 LoopALM->setOperand(2, ALMMultiplier);
2262
2263 // Create UF x extract vectors and insert into preheader.
2264 SmallVector<VPValue *> EntryExtracts(UF);
2265 ExtractFromALM(EntryALM, EntryExtracts);
2266
2267 // Create UF x extract vectors and insert before the loop compare & branch,
2268 // updating the compare to use the first extract.
2269 SmallVector<VPValue *> LoopExtracts(UF);
2270 ExtractFromALM(LoopALM, LoopExtracts);
2271 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2272 Not->setOperand(0, LoopExtracts[0]);
2273
2274 // Update the incoming values of active lane mask phis.
2275 for (unsigned Part = 0; Part < UF; ++Part) {
2276 Phis[Part]->setStartValue(EntryExtracts[Part]);
2277 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2278 }
2279
2280 return true;
2281 }
2282
2283 /// Try to simplify the branch condition of \p Plan. This may restrict the
2284 /// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the terminator of the exiting block was changed.
2286 unsigned BestUF,
2288 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2289 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2290 auto *Term = &ExitingVPBB->back();
2291 VPValue *Cond;
2292 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2293 // Check if the branch condition compares the canonical IV increment (for main
2294 // loop), or the canonical IV increment plus an offset (for epilog loop).
2295 if (match(Term, m_BranchOnCount(
2296 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2297 m_VPValue())) ||
2299 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2300 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2301 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2302 const SCEV *VectorTripCount =
2304 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2305 VectorTripCount =
2307 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2308 "Trip count SCEV must be computable");
     // Give up unless the trip count is provably <= BestVF * BestUF elements
     // (unsigned).
2309 ScalarEvolution &SE = *PSE.getSE();
2310 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2311 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2312 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2313 return false;
2314 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2316 // For BranchOnCond, check if we can prove the condition to be true using VF
2317 // and UF.
2318 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2319 return false;
2320 } else {
     // Any other terminator is left alone.
2321 return false;
2322 }
2323
2324 // The vector loop region only executes once. Convert terminator of the
2325 // exiting block to exit in the first iteration.
     // For a two-condition branch only operand 1 is set to true; the
     // terminator itself is kept.
2326 if (match(Term, m_BranchOnTwoConds())) {
2327 Term->setOperand(1, Plan.getTrue());
2328 return true;
2329 }
2330
     // Otherwise replace the terminator with BranchOnCond(true), keeping the
     // original debug location.
2331 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2332 {}, Term->getDebugLoc());
2333 ExitingVPBB->appendRecipe(BOC);
2334 Term->eraseFromParent();
2335
2336 return true;
2337 }
2338
2339 /// From the definition of llvm.experimental.get.vector.length,
2340 /// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Returns true after replacing the first such EVL recipe found, false if
/// none could be replaced.
2344 vp_depth_first_deep(Plan.getEntry()))) {
2345 for (VPRecipeBase &R : *VPBB) {
2346 VPValue *AVL;
2347 if (!match(&R, m_EVL(m_VPValue(AVL))))
2348 continue;
2349
     // The AVL must have a computable SCEV that is provably <= VF (unsigned).
2350 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2351 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2352 continue;
2353 ScalarEvolution &SE = *PSE.getSE();
2354 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2355 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2356 continue;
2357
     // NOTE(review): the call that produces Trunc is only partially visible in
     // this listing; from its arguments it presumably converts AVL from its
     // SCEV type to i32 -- confirm.
2359 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2360 R.getDebugLoc());
     // If a new recipe was created for the conversion, try to fold it when its
     // operands are live-ins.
2361 if (Trunc != AVL) {
2362 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2363 const DataLayout &DL = Plan.getDataLayout();
2364 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL,
2365 Plan.getContext()))
2366 Trunc = Folded;
2367 }
2368 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2369 return true;
2370 }
2371 }
2372 return false;
2373 }
2374
2376 unsigned BestUF,
2378 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2379 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2380
     // Run the three VF/UF-specific simplifications in order; MadeChange records
     // whether any of them modified the plan.
2381 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2382 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2383 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2384
     // Once the plan has been specialized, set its VF to BestVF.
2385 if (MadeChange) {
2386 Plan.setVF(BestVF);
2387 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2388 }
2389 }
2390
// Drop poison-generating flags from every flag-carrying recipe that
// (transitively) uses a reduction phi with one of the recurrence kinds
// checked below.
// NOTE(review): the range being iterated and the last recurrence kind in the
// check are not visible in this listing.
2392 for (VPRecipeBase &R :
2394 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2395 if (!PhiR)
2396 continue;
2397 RecurKind RK = PhiR->getRecurrenceKind();
2398 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2400 continue;
2401
     // Users that do not carry IR flags are ignored.
2402 for (VPUser *U : collectUsersRecursively(PhiR))
2403 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2404 RecWithFlags->dropPoisonGeneratingFlags();
2405 }
2406 }
2407 }
2408
2409 namespace {
/// DenseMap traits that hash and compare VPSingleDefRecipes by their
/// underlying data, so that equivalent recipes map to the same key.
2410 struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
     /// Returns true if \p Def is the map's empty key.
2411 static bool isSentinel(const VPSingleDefRecipe *Def) {
2412 return Def == getEmptyKey();
2413 }
2414
2415 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2416 /// return that source element type.
2417 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2418 // All VPInstructions that lower to GEPs must have the i8 source element
2419 // type (as they are PtrAdds), so we omit it.
     // Replicate recipes take the type from their underlying GEP instruction,
     // if any; vector-pointer and widened-GEP recipes report it directly; all
     // other recipes yield nullptr.
2421 .Case([](const VPReplicateRecipe *I) -> Type * {
2422 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2423 return GEP->getSourceElementType();
2424 return nullptr;
2425 })
2426 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2427 [](auto *I) { return I->getSourceElementType(); })
2428 .Default([](auto *) { return nullptr; });
2429 }
2430
2431 /// Returns true if recipe \p Def can be safely handled for CSE.
2432 static bool canHandle(const VPSingleDefRecipe *Def) {
2433 // We can extend the list of handled recipes in the future,
2434 // provided we account for the data embedded in them while checking for
2435 // equality or hashing.
2436 auto C = getOpcodeOrIntrinsicID(Def);
2437
2438 // The issue with (Insert|Extract)Value is that the index of the
2439 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2440 // VPlan.
2441 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2442 C->second == Instruction::ExtractValue)))
2443 return false;
2444
2445 // During CSE, we can only handle recipes that don't read from memory: if
2446 // they read from memory, there could be an intervening write to memory
2447 // before the next instance is CSE'd, leading to an incorrect result.
2448 return !Def->mayReadFromMemory();
2449 }
2450
2451 /// Hash the underlying data of \p Def.
2452 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
     // NOTE(review): the remaining arguments of this hash_combine call are not
     // visible in this listing.
2453 hash_code Result = hash_combine(
2454 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2455 getGEPSourceElementType(Def), Def->getScalarType(),
     // Mix in the compare predicate or the induction opcode for recipes that
     // carry one.
2457 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2458 if (RFlags->hasPredicate())
2459 return hash_combine(Result, RFlags->getPredicate());
2460 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2461 return hash_combine(Result, SIVSteps->getInductionOpcode());
2462 return Result;
2463 }
2464
2465 /// Check equality of underlying data of \p L and \p R.
2466 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
     // The empty key only compares equal to itself.
2467 if (isSentinel(L) || isSentinel(R))
2468 return L == R;
     // Recipe kind, GEP source element type and operands must all agree (two
     // further conditions of this check are not visible in this listing).
2469 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2471 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2473 !equal(L->operands(), R->operands()))
2474 return false;
2476 "must have valid opcode info for both recipes");
     // Recipes with a predicate must have the same predicate.
2477 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2478 if (LFlags->hasPredicate() &&
2479 LFlags->getPredicate() !=
2480 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2481 return false;
     // Scalar IV steps must use the same induction opcode.
2482 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2483 if (LSIV->getInductionOpcode() !=
2484 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2485 return false;
2486 // Recipes in replicate regions implicitly depend on predicate. If either
2487 // recipe is in a replicate region, only consider them equal if both have
2488 // the same parent.
2489 const VPRegionBlock *RegionL = L->getRegion();
2490 const VPRegionBlock *RegionR = R->getRegion();
2491 if (((RegionL && RegionL->isReplicator()) ||
2492 (RegionR && RegionR->isReplicator())) &&
2493 L->getParent() != R->getParent())
2494 return false;
2495 return L->getScalarType() == R->getScalarType();
2496 }
2497 };
2498 } // end anonymous namespace
2499
2500 /// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2501 /// Plan.
     // The dominator tree is used to check that an earlier equivalent recipe
     // dominates the one it replaces.
     // NOTE(review): the declaration of CSEMap and the block traversal header
     // are not visible in this listing.
2503 VPDominatorTree VPDT(Plan);
2505
2507 Plan.getEntry());
2509 for (VPRecipeBase &R : *VPBB) {
2510 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2511 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2512 continue;
2513 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2514 // V must dominate Def for a valid replacement.
2515 if (!VPDT.dominates(V->getParent(), VPBB))
2516 continue;
2517 // Only keep flags present on both V and Def.
2518 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2519 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2520 Def->replaceAllUsesWith(V);
2521 continue;
2522 }
     // No equivalent recipe recorded yet: Def becomes the entry for its key.
2523 CSEMap[Def] = Def;
2524 }
2525 }
2526 }
2527
2528 /// Return true if we do not know how to (mechanically) hoist or sink a
2529 /// non-memory or memory recipe \p R out of a loop region.
2531 VPBasicBlock *LastBB) {
     // Recipes that are not replicate recipes, or that do not read from memory,
     // take a separate path.
     // NOTE(review): the statement guarded by this `if` is not visible in this
     // listing; presumably it returns the result of the generic
     // vputils::cannotHoistOrSinkRecipe check -- confirm.
2532 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2534
2535 // Check that the load doesn't alias with stores between FirstBB and LastBB.
     // An unknown memory location is treated as "cannot hoist or sink".
2536 auto MemLoc = vputils::getMemoryLocation(R);
2537 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2538 }
2539
2540 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
/// Recipes whose operands are all defined outside loop regions are hoisted to
/// the vector preheader; recipes whose users all live in a single block
/// directly after the region are sunk into that block.
2541 static void licm(VPlan &Plan) {
2542 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2543
2544 // Hoist any loop invariant recipes from the vector loop region to the
2545 // preheader. Perform a shallow traversal of the vector loop region, to
2546 // exclude recipes in replicate regions. Since the top-level blocks in the
2547 // vector loop region are guaranteed to execute if the vector pre-header is,
2548 // we don't need to check speculation safety.
2549 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2550 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2551 "Expected vector prehader's successor to be the vector loop region");
2553 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2554 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2555 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2556 LoopRegion->getExitingBasicBlock()))
2557 continue;
     // Every operand must be defined outside of loop regions.
2558 if (any_of(R.operands(), [](VPValue *Op) {
2559 return !Op->isDefinedOutsideLoopRegions();
2560 }))
2561 continue;
     // Append the recipe to the end of the preheader.
2562 R.moveBefore(*Preheader, Preheader->end());
2563 }
2564 }
2565
     // The dominator tree is only needed by the assert in the sinking loop.
2566 #ifndef NDEBUG
2567 VPDominatorTree VPDT(Plan);
2568 #endif
2569 // Sink recipes with no users inside the vector loop region if all users are
2570 // in the same exit block of the region.
2571 // TODO: Extend to sink recipes from inner loops.
2573 LoopRegion->getEntry());
2575 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2576 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2577 continue;
2578
2579 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2580 assert(!RepR->isPredicated() &&
2581 "Expected prior transformation of predicated replicates to "
2582 "replicate regions");
2583 // narrowToSingleScalarRecipes should have already maximally narrowed
2584 // replicates to single-scalar replicates.
2585 // TODO: When unrolling, replicateByVF doesn't handle sunk
2586 // non-single-scalar replicates correctly.
2587 if (!RepR->isSingleScalar())
2588 continue;
2589 }
2590
2591 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2592 // support recipes with multiple defined values (e.g., interleaved loads).
2593 auto *Def = cast<VPSingleDefRecipe>(&R);
2594
2595 // Cannot sink the recipe if the user is defined in a loop region or a
2596 // non-successor of the vector loop region. Cannot sink if user is a phi
2597 // either.
     // As a side effect the lambda records the common parent block of the users
     // in SinkBB.
2598 VPBasicBlock *SinkBB = nullptr;
2599 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2600 auto *UserR = cast<VPRecipeBase>(U);
2601 VPBasicBlock *Parent = UserR->getParent();
2602 // TODO: Support sinking when users are in multiple blocks.
2603 if (SinkBB && SinkBB != Parent)
2604 return true;
2605 SinkBB = Parent;
2606 // TODO: If the user is a PHI node, we should check the block of
2607 // incoming value. Support PHI node users if needed.
2608 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2609 Parent->getSinglePredecessor() != LoopRegion;
2610 }))
2611 continue;
2612
     // A recipe without users is sunk into the region's single successor.
2613 if (!SinkBB)
2614 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2615
2616 // TODO: This will need to be a check instead of an assert after
2617 // conditional branches in vectorized loops are supported.
2618 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2619 "Defining block must dominate sink block");
2620 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2621 // just moving.
2622 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2623 }
2624 }
2625 }
2626
2628 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
     // \p MinBWs maps an underlying instruction to the bit width its recipe is
     // shrunk to; recipes with no (non-zero) entry are left alone.
2629 if (Plan.hasScalarVFOnly())
2630 return;
2631 // Keep track of created truncates, so they can be re-used. Note that we
2632 // cannot use RAUW after creating a new truncate, as this could make
2633 // other uses have different types for their operands, making them invalidly
2634 // typed.
2636 VPBasicBlock *PH = Plan.getVectorPreheader();
2639 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2642 continue;
2643
2644 VPValue *ResultVPV = R.getVPSingleValue();
2645 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2646 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2647 if (!NewResSizeInBits)
2648 continue;
2649
2650 // If the value wasn't vectorized, we must maintain the original scalar
2651 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2652 // skip casts which do not need to be handled explicitly here, as
2653 // redundant casts will be removed during recipe simplification.
2655 continue;
2656
2657 Type *OldResTy = ResultVPV->getScalarType();
2658 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2659 assert(OldResTy->isIntegerTy() && "only integer types supported");
2660 (void)OldResSizeInBits;
2661
2662 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2663
2664 // Any wrapping introduced by shrinking this operation shouldn't be
2665 // considered undefined behavior. So, we can't unconditionally copy
2666 // arithmetic wrapping flags to VPW.
2667 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2668 VPW->dropPoisonGeneratingFlags();
2669
2670 assert((OldResSizeInBits != NewResSizeInBits ||
2671 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2672 "Only ICmps should not need extending the result.");
2673 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2674
2675 // For loads/intrinsics we don't recreate the recipe; just wrap the
2676 // original wide result in a ZExt to OldResTy.
2678 if (OldResSizeInBits != NewResSizeInBits) {
2680 Instruction::ZExt, ResultVPV, OldResTy);
     // replaceAllUsesWith also rewrote the ZExt's own operand, so point it back
     // at the original result.
2681 ResultVPV->replaceAllUsesWith(Ext);
2682 Ext->setOperand(0, ResultVPV);
2683 }
2684 continue;
2685 }
2686
2687 // Shrink operands by introducing truncates as needed.
     // For selects, operand 0 is left untouched (StartIdx == 1).
2688 unsigned StartIdx =
2689 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2690 SmallVector<VPValue *> NewOperands(R.operands());
2691 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2692 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2693 if (OpSizeInBits == NewResSizeInBits)
2694 continue;
2695 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
     // Create the truncate only the first time Op is seen. Truncates of
     // VPIRValue operands are placed in the preheader, all others right before
     // R.
2696 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2697 if (Inserted) {
2698 VPBuilder Builder;
2699 if (isa<VPIRValue>(Op))
2700 Builder.setInsertPoint(PH);
2701 else
2702 Builder.setInsertPoint(&R);
2703 ProcessedIter->second =
2704 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2705 }
2706 Op = ProcessedIter->second;
2707 }
2708
     // Recreate the widened recipe on the narrowed operands, ahead of R.
2709 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2710 NWR->insertBefore(&R);
2711
2712 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2713 // users (unless this is an ICmp, which produces i1 regardless).
2714 VPValue *Replacement = NWR->getVPSingleValue();
2715 if (OldResSizeInBits != NewResSizeInBits)
2716 Replacement =
2718 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2719 ->getVPSingleValue();
2720 ResultVPV->replaceAllUsesWith(Replacement);
2721 R.eraseFromParent();
2722 }
2723 }
2724 }
2725
/// Remove BranchOnCond terminators whose condition is a constant true or
/// false, disconnect the successor that is never taken, and clean up blocks
/// that are no longer reachable afterwards. If \p OnlyLatches is set, only
/// blocks for which VPBlockUtils::isLatch holds are processed.
2726 void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
     // The dominator tree is only built when it is needed for the latch check.
2727 std::optional<VPDominatorTree> VPDT;
2728 if (OnlyLatches)
2729 VPDT.emplace(Plan);
2730
2731 // Collect all blocks before modifying the CFG so we can identify unreachable
2732 // ones after constant branch removal.
2734
2735 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2736 VPValue *Cond;
2737 // Skip blocks that are not terminated by BranchOnCond.
2738 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2739 continue;
2740
2741 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2742 continue;
2743
2744 assert(VPBB->getNumSuccessors() == 2 &&
2745 "Two successors expected for BranchOnCond");
     // A true condition removes successor 1, a false condition removes
     // successor 0; non-constant conditions are left alone.
2746 unsigned RemovedIdx;
2747 if (match(Cond, m_True()))
2748 RemovedIdx = 1;
2749 else if (match(Cond, m_False()))
2750 RemovedIdx = 0;
2751 else
2752 continue;
2753
2754 VPBasicBlock *RemovedSucc =
2755 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2756 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2757 "There must be a single edge between VPBB and its successor");
2758 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2759 // these recipes.
2760 for (VPRecipeBase &R : RemovedSucc->phis())
2761 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2762
2763 // Disconnect blocks and remove the terminator.
2764 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2765 VPBB->back().eraseFromParent();
2766 }
2767
2768 // Compute which blocks are still reachable from the entry after constant
2769 // branch removal.
2772
2773 // Detach all unreachable blocks from their successors, removing their recipes
2774 // and incoming values from phi recipes.
     // Tmp is a placeholder that replaces the uses of values defined in dead
     // blocks, so their recipes can be erased.
2775 VPSymbolicValue Tmp(nullptr);
2776 for (VPBlockBase *B : AllBlocks) {
2777 if (Reachable.contains(B))
2778 continue;
     // Iterate over a copy of the successor list.
     // NOTE(review): the statement after the phi update is not visible in this
     // listing; presumably it disconnects B from Succ -- confirm.
2779 for (VPBlockBase *Succ : to_vector(B->successors())) {
2780 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2781 for (VPRecipeBase &R : SuccBB->phis())
2782 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2784 }
2785 for (VPBasicBlock *DeadBB :
2787 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2788 for (VPValue *Def : R.definedValues())
2789 Def->replaceAllUsesWith(&Tmp);
2790 R.eraseFromParent();
2791 }
2792 }
2793 }
2794 }
2795
2815
// NOTE(review): this listing skips source lines 2839-2840 (the function
// signature), 2872 (the initializer of LaneMaskPhi) and 2881 (the first
// argument of the in-loop createOverflowingOp call). The visible caller
// invokes this as addVPLaneMaskPhiAndUpdateExitBranch(Plan).
2816// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2817// the loop terminator with a branch-on-cond recipe with the negated
2818// active-lane-mask as operand. Note that this turns the loop into an
2819// uncountable one. Only the existing terminator is replaced, all other existing
2820// recipes/users remain unchanged, except for poison-generating flags being
2821// dropped from the canonical IV increment. Return the created
2822// VPActiveLaneMaskPHIRecipe.
2823//
2824// The function adds the following recipes:
2825//
2826// vector.ph:
2827// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2828// %EntryALM = active-lane-mask %EntryInc, TC
2829//
2830// vector.body:
2831// ...
2832// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2833// ...
2834// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2835// %ALM = active-lane-mask %InLoopInc, TC
2836// %Negated = Not %ALM
2837// branch-on-cond %Negated
2838//
2841 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2842 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
 // The start value is a zero of the canonical IV type.
2843 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2844 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2845 // TODO: Check if dropping the flags is needed.
2846 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
 // All recipes created below reuse the debug location of the canonical IV
 // increment.
2847 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2848 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2849 // we have to take unrolling into account. Each part needs to start at
2850 // Part * VF
2851 auto *VecPreheader = Plan.getVectorPreheader();
2852 VPBuilder Builder(VecPreheader);
2853
2854 // Create the ActiveLaneMask instruction using the correct start values.
2855 VPValue *TC = Plan.getTripCount();
2856 VPValue *VF = &Plan.getVF();
2857
2858 auto *EntryIncrement = Builder.createOverflowingOp(
2859 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2860 DL, "index.part.next");
2861
2862 // Create the active lane mask instruction in the VPlan preheader.
 // The constant 1 is passed as the third ActiveLaneMask operand, for both
 // the entry mask and the in-loop mask.
2863 VPValue *ALMMultiplier =
2864 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2865 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2866 {EntryIncrement, TC, ALMMultiplier}, DL,
2867 "active.lane.mask.entry");
2868
2869 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2870 // preheader ActiveLaneMask instruction.
2871 auto *LaneMaskPhi =
2873 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
 // The phi is placed first in the header block.
2874 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2875
2876 // Create the active lane mask for the next iteration of the loop before the
2877 // original terminator.
2878 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2879 Builder.setInsertPoint(OriginalTerminator);
2880 auto *InLoopIncrement = Builder.createOverflowingOp(
2882 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2883 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2884 {InLoopIncrement, TC, ALMMultiplier}, DL,
2885 "active.lane.mask.next");
 // ALM is the value the phi receives over the backedge.
2886 LaneMaskPhi->addOperand(ALM);
2887
2888 // Replace the original terminator with BranchOnCond. We have to invert the
2889 // mask here because a true condition means jumping to the exit block.
2890 auto *NotMask = Builder.createNot(ALM, DL);
2891 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2892 OriginalTerminator->eraseFromParent();
2893 return LaneMaskPhi;
2894}
2895
2897 bool UseActiveLaneMaskForControlFlow) {
 // Replaces the plan's header mask with an active-lane-mask. With
 // UseActiveLaneMaskForControlFlow the mask is the phi returned by
 // addVPLaneMaskPhiAndUpdateExitBranch (which also rewrites the loop exit
 // branch); otherwise an ActiveLaneMask instruction is created right after
 // the widened canonical IV.
 // NOTE(review): this listing skips source lines 2896 (first line of the
 // signature) and 2900 (the initializer of WideCanonicalIV).
2898 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2899 auto *WideCanonicalIV =
2901 assert(WideCanonicalIV &&
2902 "Must have widened canonical IV when tail folding!");
 // NOTE(review): HeaderMask is dereferenced unconditionally at the end of
 // this function, whereas fixupVFUsersForEVL null-checks the result of
 // vputils::findHeaderMask - confirm a header mask always exists here.
2903 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2904 VPSingleDefRecipe *LaneMask;
2905 if (UseActiveLaneMaskForControlFlow) {
2906 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2907 } else {
2908 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2909 VPValue *ALMMultiplier =
2910 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2911 LaneMask =
2912 B.createNaryOp(VPInstruction::ActiveLaneMask,
2913 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2914 nullptr, "active.lane.mask");
2915 }
2916
2917 // Walk users of WideCanonicalIV and replace the header mask of the form
2918 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2919 // removing the old one to ensure there is always only a single header mask.
2920 HeaderMask->replaceAllUsesWith(LaneMask);
2921 HeaderMask->eraseFromParent();
2922}
2923
/// Pattern matcher that recognizes a mask which is either exactly \p In, or a
/// logical-and of \p In with one further mask, and reports that further mask
/// through \p Out.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  /// The mask to look for.
  Op0_t In;
  /// Caller-owned slot that receives the remaining mask. It is held by
  /// reference so that match(), which is const, can still write the result
  /// back to the caller.
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  /// Returns true if \p V is \p In itself (Out is then set to nullptr) or a
  /// logical-and whose first operand is \p In (Out is then bound to the other
  /// operand by m_VPValue).
  template <typename OpTy> bool match(OpTy *V) const {
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
2938
2939/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2940/// Returns the remaining part \p Out if so, or nullptr otherwise.
2941template <typename Op0_t, typename Op1_t>
2942static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2943 Op1_t &Out) {
2944 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2945}
2946
2947static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
2948 switch (IntrID) {
2949 case Intrinsic::masked_udiv:
2950 return Intrinsic::vp_udiv;
2951 case Intrinsic::masked_sdiv:
2952 return Intrinsic::vp_sdiv;
2953 case Intrinsic::masked_urem:
2954 return Intrinsic::vp_urem;
2955 case Intrinsic::masked_srem:
2956 return Intrinsic::vp_srem;
2957 default:
2958 return std::nullopt;
2959 }
2960}
2961
// NOTE(review): this listing skips source lines 2969 (first line of the
// signature), 2982, 2988, 3020 and 3078; the statements around those points
// are incomplete as shown. The visible caller invokes this function as
// optimizeMaskToEVL(HeaderMask, *R, *EVL).
2962/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2963/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2964/// recipe could be created.
2965/// \p HeaderMask Header Mask.
2966/// \p CurRecipe Recipe to be transformed.
2967/// \p EVL The explicit vector length parameter of vector-predication
2968/// intrinsics.
2970 VPRecipeBase &CurRecipe, VPValue &EVL) {
2971 VPlan *Plan = CurRecipe.getParent()->getPlan();
2972 DebugLoc DL = CurRecipe.getDebugLoc();
2973 VPValue *Addr, *Mask, *EndPtr;
2974
2975 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
 // The end-pointer recipe is cloned and inserted before CurRecipe; operand 1
 // of the clone is replaced by EVL converted to that operand's type.
2976 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2977 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2978 EVLEndPtr->insertBefore(&CurRecipe);
2979 // Cast EVL (i32) to match the VF operand's type.
2980 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
2981 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
2983 EVLEndPtr->setOperand(1, EVLAsVF);
2984 return EVLEndPtr;
2985 };
2986
 // Wraps V in an experimental_vp_reverse intrinsic (all-true mask, length
 // EVL) inserted before CurRecipe; a null V is passed through as nullptr.
2987 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
2989 if (!V)
2990 return nullptr;
2991 auto *Reverse = new VPWidenIntrinsicRecipe(
2992 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2993 V->getScalarType(), {}, {}, DL);
2994 Reverse->insertBefore(&CurRecipe);
2995 return Reverse;
2996 };
2997
 // Masked load whose mask is the header mask, optionally and-ed with another
 // mask: becomes an EVL load that keeps only the remaining mask.
2998 if (match(&CurRecipe,
2999 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3000 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3001 EVL, Mask);
3002
 // Reverse of a masked load with a reversed mask and a VF-based vector end
 // pointer: the load is rebuilt as an EVL load on an EVL-adjusted end
 // pointer, with mask and result reversed through vp.reverse.
3003 VPValue *ReversedVal;
3004 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3005 match(ReversedVal,
3006 m_MaskedLoad(m_VPValue(EndPtr),
3007 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3008 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3009 Mask = GetVPReverse(Mask);
3010 Addr = AdjustEndPtr(EndPtr);
3011 auto *LoadR = new VPWidenLoadEVLRecipe(
3012 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
3013 LoadR->insertBefore(&CurRecipe);
3014 return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_reverse,
3015 {LoadR, Plan->getTrue(), &EVL},
3016 LoadR->getScalarType(), {}, {}, DL);
3017 }
3018
 // Memory intrinsic recipe taking (Addr, Stride, mask, VF): the recipe is
 // cloned with operand 2 set to the remaining mask (all-true if none) and
 // operand 3 set to EVL. The opening of this match is on a line missing from
 // this listing.
3019 VPValue *Stride;
3021 m_VPValue(Addr), m_VPValue(Stride),
3022 m_RemoveMask(HeaderMask, Mask),
3023 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3024 if (!Mask)
3025 Mask = Plan->getTrue();
3026 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3027 NewLoad->setOperand(2, Mask);
3028 NewLoad->setOperand(3, &EVL);
3029 return NewLoad;
3030 }
3031
 // Masked store: same treatment as the masked load above.
3032 VPValue *StoredVal;
3033 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3034 m_RemoveMask(HeaderMask, Mask))))
3035 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3036 StoredVal, EVL, Mask);
3037
 // Masked store of a reversed value with a reversed mask: same treatment as
 // the reversed load above, with the stored value reversed via vp.reverse.
3038 if (match(&CurRecipe,
3039 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3040 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3041 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3042 Mask = GetVPReverse(Mask);
3043 Addr = AdjustEndPtr(EndPtr);
3044 StoredVal = GetVPReverse(ReversedVal);
3045 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3046 StoredVal, EVL, Mask);
3047 }
3048
 // Conditional reduction whose condition contains the header mask.
3049 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3050 if (Rdx->isConditional() &&
3051 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3052 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3053
 // Masked interleave group whose mask contains the header mask.
3054 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3055 if (Interleave->getMask() &&
3056 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3057 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3058
 // Select on the header mask: becomes vp.merge with the remaining mask (or
 // all-true) as condition and EVL as length.
3059 VPValue *LHS, *RHS;
3060 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3061 m_VPValue(RHS))))
3062 return new VPWidenIntrinsicRecipe(
3063 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3064 LHS->getScalarType(), {}, {}, DL);
3065
 // Last active lane of the header mask: computed as EVL - 1, with EVL first
 // converted to the result type.
3066 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3067 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3068 VPValue *ZExt =
3069 VPBuilder(&CurRecipe)
3070 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3071 return new VPInstruction(
3072 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3073 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3074 }
3075
3076 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3077 if (match(&CurRecipe,
3079 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3080 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3081 {RHS, Plan->getTrue(), LHS, &EVL},
3082 LHS->getScalarType(), {}, {}, DL);
3083
 // Masked div/rem intrinsic whose mask (operand 2) contains the header mask:
 // becomes the vp.* intrinsic returned by getVPDivRemIntrinsic.
3084 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3085 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3086 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3087 return new VPWidenIntrinsicRecipe(*VPID,
3088 {IntrR->getOperand(0),
3089 IntrR->getOperand(1),
3090 Mask ? Mask : Plan->getTrue(), &EVL},
3091 IntrR->getScalarType(), {}, {}, DL);
3092
 // No EVL-based equivalent is known for this recipe.
3093 return nullptr;
3094}
3095
// NOTE(review): this listing skips source lines 3098 (the function
// signature), 3101-3102 (the loop header and start of the match that finds the
// header mask), 3114 (the definition of R in the first user loop) and 3143
// (the handling of PossiblyDead).
3096/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3097/// The transforms here need to preserve the original semantics.
3099 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3100 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3103 m_VPValue(EVL))) &&
3104 match(EVL, m_EVL(m_VPValue()))) {
3105 HeaderMask = R.getVPSingleValue();
3106 break;
3107 }
3108 }
 // Nothing to do if the plan has no EVL-based header mask.
3109 if (!HeaderMask)
3110 return;
3111
 // Replaced recipes are only recorded here and erased at the end.
3112 SmallVector<VPRecipeBase *> OldRecipes;
3113 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3115 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3116 NewR->insertBefore(R);
 // zip_equal pairs the defined values of the old and new recipe one to one.
3117 for (auto [Old, New] :
3118 zip_equal(R->definedValues(), NewR->definedValues()))
3119 Old->replaceAllUsesWith(New);
3120 OldRecipes.push_back(R);
3121 }
3122 }
3123
3124 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3125 // False, EVL)
3126 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3127 VPValue *Mask;
3128 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3129 auto *LogicalAnd = cast<VPInstruction>(U);
3130 auto *Merge = new VPWidenIntrinsicRecipe(
3131 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3132 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3133 Merge->insertBefore(LogicalAnd);
3134 LogicalAnd->replaceAllUsesWith(Merge);
3135 OldRecipes.push_back(LogicalAnd);
3136 }
3137 }
3138
 // Erase the replaced recipes in reverse order of collection. The operands
 // are copied first because R no longer exists after eraseFromParent().
3139 for (VPRecipeBase *R : reverse(OldRecipes)) {
3140 SmallVector<VPValue *> PossiblyDead(R->operands());
3141 R->eraseFromParent();
3142 for (VPValue *Op : PossiblyDead)
3144 }
3145}
3146
// NOTE(review): this listing skips source lines 3156-3158, 3173, 3177, 3180,
// 3186, 3192, 3200, 3206-3207, 3211, 3215-3216 and 3250; the statements around
// those points (the EVLAsIdx initializer, both replaceUsesWithIf predicates,
// the ContainsFORs initializer, the splice loop header and the ICmp
// predicate, among others) are incomplete as shown.
3147/// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3148/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3149/// iteration.
3150static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3151 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3152 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3153
3154 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3155 VPValue *EVLAsIdx =
3159
 // Sanity check: every user of VF, or every user of a trunc of VF, must be
 // one of the recipe kinds listed in IsAllowedUser.
3160 assert(all_of(Plan.getVF().users(),
3161 [&Plan](VPUser *U) {
3162 auto IsAllowedUser =
3163 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3164 VPWidenIntOrFpInductionRecipe,
3165 VPWidenMemIntrinsicRecipe>;
3166 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3167 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3168 IsAllowedUser);
3169 return IsAllowedUser(U);
3170 }) &&
3171 "User of VF that we can't transform to EVL.");
3172 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3174 });
3175
3176 assert(all_of(Plan.getVFxUF().users(),
3178 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3179 m_Specific(&Plan.getVFxUF())),
3181 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3182 "increment of the canonical induction.");
3183 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3184 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3185 // canonical induction must not be updated.
3187 });
3188
3189 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3190 // contained.
3191 bool ContainsFORs =
3193 if (ContainsFORs) {
3194 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3195 VPValue *MaxEVL = &Plan.getVF();
3196 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3197 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3198 MaxEVL = Builder.createScalarZExtOrTrunc(
3199 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3201
 // prev.evl is a scalar phi with operands MaxEVL (first) and the current
 // EVL (second).
3202 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3203 VPValue *PrevEVL = Builder.createScalarPhi(
3204 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3205
 // Each recipe matched below is replaced by an experimental_vp_splice that
 // takes both PrevEVL and EVL as operands.
3208 for (VPRecipeBase &R : *VPBB) {
3209 VPValue *V1, *V2;
3210 if (!match(&R,
3212 m_VPValue(V1), m_VPValue(V2))))
3213 continue;
3214 VPValue *Imm = Plan.getOrAddLiveIn(
3217 Intrinsic::experimental_vp_splice,
3218 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3219 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3220 VPSplice->insertBefore(&R);
3221 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3222 }
3223 }
3224 }
3225
 // Without a header mask there is nothing further to rewrite.
3226 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3227 if (!HeaderMask)
3228 return;
3229
3230 // Ensure that any reduction that uses a select to mask off tail lanes does so
3231 // in the vector loop, not the middle block, since EVL tail folding can have
3232 // tail elements in the penultimate iteration.
3233 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3234 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3235 m_VPValue(), m_VPValue()))))
3236 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3237 Plan.getVectorLoopRegion();
3238 return true;
3239 }));
3240
3241 // Replace header masks with a mask equivalent to predicating by EVL:
3242 //
3243 // icmp ule widen-canonical-iv backedge-taken-count
3244 // ->
3245 // icmp ult step-vector, EVL
 // The new mask is created immediately after the recipe defining EVL.
3246 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3247 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3248 Type *EVLType = EVL.getScalarType();
3249 VPValue *EVLMask = Builder.createICmp(
3251 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3252 HeaderMask->replaceAllUsesWith(EVLMask);
3253}
3254
// NOTE(review): this listing skips source lines 3308 (first line of the
// signature) and 3322 (the initializer of CurrentIteration).
3255/// Converts a tail folded vector loop region to step by
3256/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3257/// iteration.
3258///
3259/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3260/// replace all uses of the canonical IV except for the canonical IV
3261/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3262/// only for loop iterations counting after this transformation.
3263///
3264/// - The header mask is replaced with a header mask based on the EVL.
3265///
3266/// - Plans with FORs have a new phi added to keep track of the EVL of the
3267/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3268/// @llvm.vp.splice.
3269///
3270/// The function uses the following definitions:
3271/// %StartV is the canonical induction start value.
3272///
3273/// The function adds the following recipes:
3274///
3275/// vector.ph:
3276/// ...
3277///
3278/// vector.body:
3279/// ...
3280/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3281/// [ %NextIter, %vector.body ]
3282/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3283/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3284/// ...
3285/// %OpEVL = cast i32 %VPEVL to IVSize
3286/// %NextIter = add IVSize %OpEVL, %CurrentIter
3287/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3288/// ...
3289///
3290/// If MaxSafeElements is provided, the function adds the following recipes:
3291/// vector.ph:
3292/// ...
3293///
3294/// vector.body:
3295/// ...
3296/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3297/// [ %NextIter, %vector.body ]
3298/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3299/// %cmp = cmp ult %AVL, MaxSafeElements
3300/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3301/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3302/// ...
3303/// %OpEVL = cast i32 %VPEVL to IVSize
3304/// %NextIter = add IVSize %OpEVL, %CurrentIter
3305/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3306/// ...
3307///
3309 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
 // Plans that only have a scalar VF are left untouched.
3310 if (Plan.hasScalarVFOnly())
3311 return;
3312 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3313 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3314
3315 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3316 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3317 VPValue *StartV = Plan.getZero(CanIVTy);
3318 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3319
3320 // Create the CurrentIteration recipe in the vector loop.
3321 auto *CurrentIteration =
3323 CurrentIteration->insertBefore(*Header, Header->begin());
3324 VPBuilder Builder(Header, Header->getFirstNonPhi());
3325 // Create the AVL (application vector length), starting from TC -> 0 in steps
3326 // of EVL.
 // The phi starts with only the trip count operand; its second operand
 // (NextAVL) is added further down once it has been created.
3327 VPPhi *AVLPhi = Builder.createScalarPhi(
3328 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3329 VPValue *AVL = AVLPhi;
3330
3331 if (MaxSafeElements) {
3332 // Support for MaxSafeDist for correct loop emission.
 // AVL = (AVL u< MaxSafeElements) ? AVL : MaxSafeElements
3333 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3334 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3335 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3336 "safe_avl");
3337 }
3338 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3339 DebugLoc::getUnknown(), "evl");
3340
 // The increments below are created right before the canonical IV increment.
3341 Builder.setInsertPoint(CanonicalIVIncrement);
3342 VPValue *OpVPEVL = VPEVL;
3343
 // Convert the i32 EVL to the canonical IV type.
3344 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3345 OpVPEVL = Builder.createScalarZExtOrTrunc(
3346 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3347
 // NextIter reuses the debug location and no-wrap flags of the canonical IV
 // increment.
3348 auto *NextIter = Builder.createAdd(
3349 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3350 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3351 CurrentIteration->addOperand(NextIter);
3352
 // Note that the subtraction uses the uncapped AVLPhi, not the possibly
 // capped AVL.
3353 VPValue *NextAVL =
3354 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3355 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3356 AVLPhi->addOperand(NextAVL);
3357
3358 fixupVFUsersForEVL(Plan, *VPEVL);
3359 removeDeadRecipes(Plan);
3360
3361 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3362 // except for the canonical IV increment.
 // replaceAllUsesWith also rewrites the increment's operand, so operand 0 is
 // set back to the canonical IV afterwards.
3363 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3364 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3365 // TODO: support unroll factor > 1.
3366 Plan.setUF(1);
3367}
3368
 // NOTE(review): this listing skips source lines 3369 (the function
 // signature), 3374-3375 (the loop that visits the blocks) and 3393 (the
 // builder call that creates ScalarR).
 // This body locates the single VPCurrentIterationPHIRecipe of the plan,
 // replaces it with the recipe named ScalarR built from its start and
 // backedge values, and replaces the add of VFxUF that uses the first header
 // recipe (if one is found) with the current-iteration increment.
3370 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3371 // There should be only one VPCurrentIteration in the entire plan.
3372 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3373
3376 for (VPRecipeBase &R : VPBB->phis())
3377 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3378 assert(!CurrentIteration &&
3379 "Found multiple CurrentIteration. Only one expected");
3380 CurrentIteration = PhiR;
3381 }
3382
3383 // Early return if it is not variable-length stepping.
3384 if (!CurrentIteration)
3385 return;
3386
 // Both values are read before CurrentIteration is erased below.
3387 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3388 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3389
3390 // Convert CurrentIteration to concrete recipe.
3391 auto *ScalarR =
3392 VPBuilder(CurrentIteration)
3394 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3395 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3396 CurrentIteration->replaceAllUsesWith(ScalarR);
3397 CurrentIteration->eraseFromParent();
3398
3399 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
 // The canonical IV is taken to be the first recipe of the header block.
 // NOTE(review): cast<> asserts that this recipe is a VPPhi - confirm that
 // this always holds once CurrentIteration has been erased.
3400 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3401 if (auto *CanIVInc = findUserOf(
3402 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3403 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3404 CanIVInc->eraseFromParent();
3405 }
3406}
3407
 // NOTE(review): this listing skips source lines 3408 (the function
 // signature), 3429 (start of the safe-AVL select pattern), 3436 (start of
 // the match that binds AVLNext) and 3448 (start of the latch branch
 // pattern).
 // For an EVL tail-folded loop, this body rewrites the condition of the latch
 // branch to `AVLNext == 0` (see the final setOperand call). It returns early
 // when there is no loop region, the header is empty, the first header recipe
 // is not a VPCurrentIterationPHIRecipe, the backedge value does not have the
 // expected EVL-add form, or the latch already branches on a true condition.
3409 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3410 if (!LoopRegion)
3411 return;
3412 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3413 if (Header->empty())
3414 return;
3415 // The EVL IV is always at the beginning.
3416 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3417 if (!EVLPhi)
3418 return;
3419
3420 // Bail if not an EVL tail folded loop.
 // Expected backedge value: (zext-or-self EVL(AVL)) + EVLPhi, in either
 // operand order.
3421 VPValue *AVL;
3422 if (!match(EVLPhi->getBackedgeValue(),
3423 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3424 return;
3425
3426 // The AVL may be capped to a safe distance.
 // If the capping pattern matches, continue with the uncapped AVL.
3427 VPValue *SafeAVL, *UnsafeAVL;
3428 if (match(AVL,
3430 m_VPValue(SafeAVL)),
3431 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3432 AVL = UnsafeAVL;
3433
3434 VPValue *AVLNext;
3435 [[maybe_unused]] bool FoundAVLNext =
3437 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3438 assert(FoundAVLNext && "Didn't find AVL backedge?");
3439
3440 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3441 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3442 if (match(LatchBr, m_BranchOnCond(m_True())))
3443 return;
3444
3445 VPValue *CanIVInc;
3446 [[maybe_unused]] bool FoundIncrement = match(
3447 LatchBr,
3449 m_Specific(&Plan.getVectorTripCount()))));
3450 assert(FoundIncrement &&
3451 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3452 m_Specific(&Plan.getVFxUF()))) &&
3453 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3454 "trip count");
3455
 // New exit condition: icmp eq AVLNext, 0, created right before the branch.
3456 Type *AVLTy = AVLNext->getScalarType();
3457 VPBuilder Builder(LatchBr);
3458 LatchBr->setOperand(
3459 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3460}
3461
3463 VPlan &Plan, PredicatedScalarEvolution &PSE,
3464 const DenseMap<Value *, const SCEV *> &StridesMap) {
 // NOTE(review): this listing skips source lines 3462 (first line of the
 // signature) and 3493 (the condition guarding the `continue` in the loop
 // over the users of StrideV).
3465 // Replace VPValues for known constant strides guaranteed by predicated scalar
3466 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3467 // blocks dominated by the vector preheader.
3468 assert(!Plan.getVectorLoopRegion() &&
3469 "expected to run before loop regions are created");
3470 VPDominatorTree VPDT(Plan);
 // The successor of the plan entry at index 1 is used as the preheader.
3471 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
 // A use may be rewritten only if the block of the using recipe is dominated
 // by Preheader.
3472 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3473 auto *R = cast<VPRecipeBase>(&U);
3474 VPBlockBase *Parent = R->getParent();
3475 return VPDT.dominates(Preheader, Parent);
3476 };
 // Maps each handled stride value to its predicated SCEV; used further down
 // to rewrite the SCEV expressions expanded in the entry block.
3477 ValueToSCEVMapTy RewriteMap;
3478 for (const SCEV *Stride : StridesMap.values()) {
3479 using namespace SCEVPatternMatch;
3480 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3481 const APInt *StrideConst;
3482 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3483 // Only handle constant strides for now.
3484 continue;
3485
3486 auto *CI = Plan.getConstantInt(*StrideConst);
3487 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3488 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3489
3490 // The versioned value may not be used in the loop directly but through a
3491 // sext/zext. Add new live-ins in those cases.
3492 for (Value *U : StrideV->users()) {
3494 continue;
3495 VPValue *StrideVPV = Plan.getLiveIn(U);
3496 if (!StrideVPV)
3497 continue;
 // Extend the constant to the width of the user: sign-extend for a
 // SExtInst, zero-extend otherwise.
3498 unsigned BW = U->getType()->getScalarSizeInBits();
3499 APInt C =
3500 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3501 VPValue *CI = Plan.getConstantInt(C);
3502 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3503 }
3504 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3505 }
3506
 // Rewrite the SCEV of each VPExpandSCEVRecipe in the entry block using
 // RewriteMap. If the expression changes, redirect the recipe's users to a
 // VPValue for the new expression and update the plan's trip count when the
 // recipe was the trip count.
3507 for (VPRecipeBase &R : *Plan.getEntry()) {
3508 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3509 if (!ExpSCEV)
3510 continue;
3511 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3512 auto *NewSCEV =
3513 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3514 if (NewSCEV != ScevExpr) {
3515 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3516 ExpSCEV->replaceAllUsesWith(NewExp);
3517 if (Plan.getTripCount() == ExpSCEV)
3518 Plan.resetTripCount(NewExp);
3519 }
3520 }
3521}
3522
 // NOTE(review): this listing skips source lines 3523 (the function
 // signature), 3526 and 3528 (presumably the declarations of `Visited` and
 // `Worklist`, which are used below but not declared in the visible code),
 // 3542 (start of the isa<> check that prunes the search), 3568-3569 (the
 // definition of `Instr`) and 3594-3595 (the initializer of `Iter` and the
 // loop header over the blocks).
3524 // Collect recipes in the backward slice of `Root` that may generate a poison
3525 // value that is used after vectorization.
3527 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3529 Worklist.push_back(Root);
3530
3531 // Traverse the backward slice of Root through its use-def chain.
3532 while (!Worklist.empty()) {
3533 VPRecipeBase *CurRec = Worklist.pop_back_val();
3534
 // Each recipe is processed at most once.
3535 if (!Visited.insert(CurRec).second)
3536 continue;
3537
3538 // Prune search if we find another recipe generating a widen memory
3539 // instruction. Widen memory instructions involved in address computation
3540 // will lead to gather/scatter instructions, which don't need to be
3541 // handled.
3543 VPHeaderPHIRecipe>(CurRec))
3544 continue;
3545
3546 // This recipe contributes to the address computation of a widen
3547 // load/store. If the underlying instruction has poison-generating flags,
3548 // drop them directly.
3549 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3550 VPValue *A, *B;
3551 // Dropping disjoint from an OR may yield incorrect results, as some
3552 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3553 // for dependence analysis). Instead, replace it with an equivalent Add.
3554 // This is possible as all users of the disjoint OR only access lanes
3555 // where the operands are disjoint or poison otherwise.
3556 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3557 RecWithFlags->isDisjoint()) {
3558 VPBuilder Builder(RecWithFlags);
3559 VPInstruction *New =
3560 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3561 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3562 RecWithFlags->replaceAllUsesWith(New);
3563 RecWithFlags->eraseFromParent();
 // Continue the traversal from the replacement; the old recipe is gone.
3564 CurRec = New;
3565 } else
3566 RecWithFlags->dropPoisonGeneratingFlags();
3567 } else {
3570 (void)Instr;
3571 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3572 "found instruction with poison generating flags not covered by "
3573 "VPRecipeWithIRFlags");
3574 }
3575
3576 // Add new definitions to the worklist.
3577 for (VPValue *Operand : CurRec->operands())
3578 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3579 Worklist.push_back(OpDef);
3580 }
3581 });
3582
3583 // We want to exclude the tail folding case, as we don't need to drop flags
3584 // for operations computing the first lane in this case: the first lane of the
3585 // header mask must always be true.
 // True only for a non-null mask that is not the header mask.
3586 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3587 return Mask && !vputils::isHeaderMask(Mask, Plan);
3588 };
3589
3590 // Traverse all the recipes in the VPlan and collect the poison-generating
3591 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3592 // VPInterleaveRecipe.
3593 auto Iter =
3596 for (VPRecipeBase &Recipe : *VPBB) {
 // Widened memory accesses are only considered when they are consecutive
 // and masked by something other than the header mask; interleave groups
 // only need the latter.
3597 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3598 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3599 if (AddrDef && WidenRec->isConsecutive() &&
3600 IsNotHeaderMask(WidenRec->getMask()))
3601 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3602 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3603 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3604 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3605 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3606 }
3607 }
3608 }
3609}
3610
3612 VPlan &Plan,
3614 &InterleaveGroups,
3615 const bool &EpilogueAllowed) {
// Replaces the widened memory recipes belonging to each interleave group in
// InterleaveGroups with a single VPInterleaveRecipe inserted at the group's
// insert position. Nothing is changed when there are no groups.
3616 if (InterleaveGroups.empty())
3617 return;
3618
// Map every IR memory instruction to the VPWidenMemoryRecipe that widens it,
// so group members can be looked up by their underlying instruction.
3620 for (VPBasicBlock *VPBB :
3623 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3624 return isa<VPWidenMemoryRecipe>(&R);
3625 })) {
3626 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3627 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3628 }
3629
3630 // Interleave memory: for each Interleave Group we marked earlier as relevant
3631 // for this VPlan, replace the Recipes widening its memory instructions with a
3632 // single VPInterleaveRecipe at its insertion point.
3633 VPDominatorTree VPDT(Plan);
3634 for (const auto *IG : InterleaveGroups) {
3635 // Skip interleave groups where members don't have recipes. This can happen
3636 // when removeDeadRecipes removes recipes that are part of interleave groups
3637 // but have no users.
3638 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3639 return !IRMemberToRecipe.contains(Member);
3640 }))
3641 continue;
3642
// Start is the recipe of member 0. Walk the remaining member slots (skipping
// gaps) to collect the stored values of store members in member order and to
// intersect the metadata of all members into InterleaveMD.
3643 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3644 VPIRMetadata InterleaveMD(*Start);
3645 SmallVector<VPValue *, 4> StoredValues;
3646 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3647 StoredValues.push_back(StoreR->getStoredValue());
3648 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3649 Instruction *MemberI = IG->getMember(I);
3650 if (!MemberI)
3651 continue;
3652 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3653 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3654 StoredValues.push_back(StoreR->getStoredValue());
3655 InterleaveMD.intersect(*MemoryR);
3656 }
3657
// A gap mask is requested when the group requires a scalar epilogue that is
// not allowed, or when the group has stored values but is not full.
3658 bool NeedsMaskForGaps =
3659 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3660 (!StoredValues.empty() && !IG->isFull());
3661
3662 Instruction *IRInsertPos = IG->getInsertPos();
3663 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3664 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3665
// If the insert position's pointer operand (ignoring pointer casts) is a GEP,
// take its no-wrap flags with no-unsigned-wrap removed.
3667 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3668 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3669 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3670
3671 // Get or create the start address for the interleave group.
3672 VPValue *Addr = Start->getAddr();
3673 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3674 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3675 // We cannot re-use the address of member zero because it does not
3676 // dominate the insert position. Instead, use the address of the insert
3677 // position and create a PtrAdd adjusting it to the address of member
3678 // zero.
3679 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3680 // InsertPos or sink loads above zero members to join it.
3681 assert(IG->getIndex(IRInsertPos) != 0 &&
3682 "index of insert position shouldn't be zero");
3683 auto &DL = IRInsertPos->getDataLayout();
// Byte distance from member zero to the insert position: alloc size of the
// accessed type times the member index. It is negated below to step back.
3684 APInt Offset(32,
3685 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3686 IG->getIndex(IRInsertPos),
3687 /*IsSigned=*/true);
3688 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3689 VPBuilder B(InsertPosR);
3690 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3691 }
3692 // If the group is reverse, adjust the index to refer to the last vector
3693 // lane instead of the first. We adjust the index from the first vector
3694 // lane, rather than directly getting the pointer for lane VF - 1, because
3695 // the pointer operand of the interleaved access is supposed to be uniform.
3696 if (IG->isReverse()) {
3697 auto *ReversePtr = new VPVectorEndPointerRecipe(
3698 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3699 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3700 ReversePtr->insertBefore(InsertPosR);
3701 Addr = ReversePtr;
3702 }
// Create the interleave recipe using the mask and debug location of the
// insert-position recipe, and place it right before that recipe.
3703 auto *VPIG = new VPInterleaveRecipe(
3704 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3705 InterleaveMD, InsertPosR->getDebugLoc());
3706 VPIG->insertBefore(InsertPosR);
3707
// Redirect users of each member whose IR type is not void to the matching
// result of the interleave recipe, then erase the member recipe. J advances
// only for members that define a value.
3708 unsigned J = 0;
3709 for (unsigned i = 0; i < IG->getFactor(); ++i)
3710 if (Instruction *Member = IG->getMember(i)) {
3711 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3712 if (!Member->getType()->isVoidTy()) {
3713 VPValue *OriginalV = MemberR->getVPSingleValue();
3714 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3715 J++;
3716 }
3717 MemberR->eraseFromParent();
3718 }
3719 }
3720}
3721
3722/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3723/// value, phi and backedge value. In the following example:
3724///
3725/// vector.ph:
3726/// Successor(s): vector loop
3727///
3728/// <x1> vector loop: {
3729/// vector.body:
3730/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3731/// ...
3732/// EMIT branch-on-count ...
3733/// No successors
3734/// }
3735///
3736/// WIDEN-INDUCTION will get expanded to:
3737///
3738/// vector.ph:
3739/// ...
3740/// vp<%induction.start> = ...
3741/// vp<%induction.increment> = ...
3742///
3743/// Successor(s): vector loop
3744///
3745/// <x1> vector loop: {
3746/// vector.body:
3747/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3748/// ...
3749/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3750/// EMIT branch-on-count ...
3751/// No successors
3752/// }
3753static void
3755 VPlan *Plan = WidenIVR->getParent()->getPlan();
3756 VPValue *Start = WidenIVR->getStartValue();
3757 VPValue *Step = WidenIVR->getStepValue();
3758 VPValue *VF = WidenIVR->getVFValue();
3759 DebugLoc DL = WidenIVR->getDebugLoc();
3760
3761 // The value from the original loop to which we are mapping the new induction
3762 // variable.
3763 Type *Ty = WidenIVR->getScalarType();
3764
3765 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// Pick the add/multiply opcodes: integer Add/Mul for integer inductions,
// otherwise the induction's own opcode together with FMul. The IR flags of the
// widened induction recipe are reused for the arithmetic created below.
3768 VPIRFlags Flags = *WidenIVR;
3769 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3770 AddOp = Instruction::Add;
3771 MulOp = Instruction::Mul;
3772 } else {
3773 AddOp = ID.getInductionOpcode();
3774 MulOp = Instruction::FMul;
3775 }
3776
3777 // If the phi is truncated, truncate the start and step values.
3778 VPBuilder Builder(Plan->getVectorPreheader());
3779 Type *StepTy = Step->getScalarType();
3780 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3781 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3782 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3783 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3784 StepTy = Ty;
3785 }
3786
3787 // Construct the initial value of the vector IV in the vector loop preheader.
3788 Type *IVIntTy =
3790 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
// For floating-point steps the integer step vector is converted to the step
// type first.
3791 if (StepTy->isFloatingPointTy())
3792 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3793
3794 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3795 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3796
// Init = broadcast(Start) AddOp (step-vector MulOp broadcast(Step)).
3797 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3798 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3799 DebugLoc::getUnknown(), "induction");
3800
3801 // Create the widened phi of the vector IV.
3802 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3803 Init, WidenIVR->getDebugLoc(), "vec.ind");
3804
3805 // Create the backedge value for the vector IV.
3806 VPValue *Inc;
3807 VPValue *Prev;
3808 // If unrolled, use the increment and prev value from the operands.
3809 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3810 Inc = SplatVF;
3811 Prev = WidenIVR->getLastUnrolledPartOperand();
3812 } else {
3813 // Move the insertion point after the VF definition when the VF is defined
3814 // inside a loop, such as for EVL tail-folding.
3815 if (VPRecipeBase *R = VF->getDefiningRecipe())
3816 if (R->getParent()->getEnclosingLoopRegion())
3817 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3818
3819 // Multiply the vectorization factor by the step using integer or
3820 // floating-point arithmetic as appropriate.
3821 if (StepTy->isFloatingPointTy())
3822 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3823 DL);
3824 else
3825 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3826
3827 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3828 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3829 Prev = WidePHI;
3830 }
3831
// Emit the increment right before the terminator of ExitingBB and add it as
// the next operand of the widened phi (its backedge value).
3833 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3834 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3835 WidenIVR->getDebugLoc(), "vec.ind.next");
3836
3837 WidePHI->addOperand(Next);
3838
// Only the uses are redirected; WidenIVR itself is not erased here.
3839 WidenIVR->replaceAllUsesWith(WidePHI);
3840}
3841
3842/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3843/// initial value, phi and backedge value. In the following example:
3844///
3845/// <x1> vector loop: {
3846/// vector.body:
3847/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3848/// ...
3849/// EMIT branch-on-count ...
3850/// }
3851///
3852/// WIDEN-POINTER-INDUCTION will get expanded to:
3853///
3854/// <x1> vector loop: {
3855/// vector.body:
3856/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3857/// EMIT %mul = mul %stepvector, %step
3858/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3859/// ...
3860/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3861/// EMIT branch-on-count ...
3862/// }
3864 VPlan *Plan = R->getParent()->getPlan();
3865 VPValue *Start = R->getStartValue();
3866 VPValue *Step = R->getStepValue();
3867 VPValue *VF = R->getVFValue();
3868
// Preconditions: the recipe describes a pointer-typed induction, and recipes
// that only generate scalars are expected to have been replaced already.
3869 assert(R->getInductionDescriptor().getKind() ==
3871 "Not a pointer induction according to InductionDescriptor!");
3872 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3873 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3874 "Recipe should have been replaced");
3875
3876 VPBuilder Builder(R);
3877 DebugLoc DL = R->getDebugLoc();
3878
3879 // Build a scalar pointer phi.
3880 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3881
3882 // Create actual address geps that use the pointer phi as base and a
3883 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3884 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3885 Type *StepTy = Step->getScalarType();
3886 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3887 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3888 VPValue *PtrAdd =
3889 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
// Users of the recipe now use the wide pointer add; R itself is not erased
// here.
3890 R->replaceAllUsesWith(PtrAdd);
3891
3892 // Create the backedge value for the scalar pointer phi.
// The increment is Step * VF (VF zero-extended or truncated to the step
// type), emitted right before the terminator of ExitingBB.
3894 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3895 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3896 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3897
3898 VPValue *InductionGEP =
3899 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3900 ScalarPtrPhi->addOperand(InductionGEP);
3901}
3902
3903/// Expand a VPDerivedIVRecipe into executable recipes.
// New recipes are inserted at the position of R. Every handled case replaces
// the uses of R; R itself is not erased here.
3905 VPBuilder Builder(R);
3906 VPIRValue *Start = R->getStartValue();
3907 VPValue *Step = R->getStepValue();
3908 VPValue *Index = R->getIndex();
3909 Type *StepTy = Step->getScalarType();
3910 Type *IndexTy = Index->getScalarType();
// Convert Index to the step type: sign-extend or truncate for integer steps,
// signed integer-to-FP conversion otherwise.
3911 Index = StepTy->isIntegerTy()
3912 ? Builder.createScalarSExtOrTrunc(
3913 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3914 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3916 switch (R->getInductionKind()) {
// Result: Start + Index * Step.
3918 assert(Index->getScalarType() == Start->getScalarType() &&
3919 "Index type does not match StartValue type");
3920 return R->replaceAllUsesWith(Builder.createAdd(
3921 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3922 }
// Result: ptradd(Start, Index * Step).
3924 return R->replaceAllUsesWith(Builder.createPtrAdd(
3925 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// Result: Start <fadd|fsub> (Step * Index), using the opcode and fast-math
// flags of the original FP binary operator.
3927 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3928 const FPMathOperator *FPBinOp = R->getFPBinOp();
3929 assert(FPBinOp &&
3930 (FPBinOp->getOpcode() == Instruction::FAdd ||
3931 FPBinOp->getOpcode() == Instruction::FSub) &&
3932 "Original BinOp should be defined for FP induction");
3933 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3934 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3935 return R->replaceAllUsesWith(
3936 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3937 }
3939 return;
3940 }
3941 llvm_unreachable("Unhandled induction kind");
3942}
3943
3945 // Replace loop regions with explicit CFG.
// Regions are collected first and dissolved in a second pass, presumably so
// that the CFG is not modified while it is being traversed. Replicator
// regions are left untouched.
3946 SmallVector<VPRegionBlock *> LoopRegions;
3948 vp_depth_first_deep(Plan.getEntry()))) {
3949 if (!R->isReplicator())
3950 LoopRegions.push_back(R);
3951 }
3952 for (VPRegionBlock *R : LoopRegions)
3953 R->dissolveToCFGLoop();
3954}
3955
3958 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3959 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// Collect the BranchOnTwoConds terminators first; the CFG is rewritten in the
// loop over WorkList below.
3962 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3963 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3964 }
3965
3966 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3967 // single-condition branches:
3968 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3969 // the first condition is true, and otherwise jumps to a new interim block.
3970 // 2. A branch that ends the interim block, jumps to the second successor if
3971 // the second condition is true, and otherwise jumps to the third
3972 // successor.
3973 for (VPInstruction *Br : WorkList) {
3974 assert(Br->getNumOperands() == 2 &&
3975 "BranchOnTwoConds must have exactly 2 conditions");
3976 DebugLoc DL = Br->getDebugLoc();
3977 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy the successor list before disconnecting, since disconnecting changes
// the block's successors.
3978 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3979 assert(Successors.size() == 3 &&
3980 "BranchOnTwoConds must have exactly 3 successors");
3981
// Detach all three successors; they are re-attached below around the new
// interim block.
3982 for (VPBlockBase *Succ : Successors)
3983 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3984
3985 VPValue *Cond0 = Br->getOperand(0);
3986 VPValue *Cond1 = Br->getOperand(1);
3987 VPBlockBase *Succ0 = Successors[0];
3988 VPBlockBase *Succ1 = Successors[1];
3989 VPBlockBase *Succ2 = Successors[2];
3990 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3991 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3992
3993 VPBasicBlock *InterimBB =
3994 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3995
// First branch: BrOnTwoCondsBB continues to Succ0 or to InterimBB.
3996 VPBuilder(BrOnTwoCondsBB)
3998 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3999 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4000
// Second branch: InterimBB continues to Succ1 or to Succ2.
4002 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4003 VPBlockUtils::connectBlocks(InterimBB, Succ2);
// The original two-condition branch is removed once the new edges exist.
4004 Br->eraseFromParent();
4005 }
4006}
4007
4010 vp_depth_first_deep(Plan.getEntry()))) {
// Recipes are erased while iterating, hence the early-increment range. Each
// handled recipe kind is lowered and the loop moves on via 'continue'.
4011 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4012 VPBuilder Builder(&R);
4013 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4015 WidenIVR->eraseFromParent();
4016 continue;
4017 }
4018
4019 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4020 // If the recipe only generates scalars, scalarize it instead of
4021 // expanding it.
4022 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4023 VPValue *PtrAdd =
4024 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4025 WidenIVR->replaceAllUsesWith(PtrAdd);
4026 WidenIVR->eraseFromParent();
4027 continue;
4028 }
4030 WidenIVR->eraseFromParent();
4031 continue;
4032 }
4033
4034 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4035 expandVPDerivedIV(DerivedIVR);
4036 DerivedIVR->eraseFromParent();
4037 continue;
4038 }
4039
// Lower the widened canonical IV to
// broadcast(CanIV) + (broadcast(Step) + step-vector). A missing step is
// replaced by zero, which is only expected when the concrete UF is 1.
4040 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4041 VPValue *CanIV = WideCanIV->getCanonicalIV();
4042 Type *CanIVTy = CanIV->getScalarType();
4043 VPValue *Step = WideCanIV->getStepValue();
4044 if (!Step) {
4045 assert(Plan.getConcreteUF() == 1 &&
4046 "Expected unroller to have materialized step for UF != 1");
4047 Step = Plan.getZero(CanIVTy);
4048 }
4049 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4050 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4051 Step = Builder.createAdd(
4052 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4053 VPValue *CanVecIV =
4054 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4055 WideCanIV->getNoWrapFlags());
4056 WideCanIV->replaceAllUsesWith(CanVecIV);
4057 WideCanIV->eraseFromParent();
4058 continue;
4059 }
4060
4061 // Expand VPBlendRecipe into VPInstruction::Select.
// The chain starts from incoming value 0; each later incoming value I is
// selected over the running result under mask I.
4062 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4063 VPValue *Select = Blend->getIncomingValue(0);
4064 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4065 Select = Builder.createSelect(Blend->getMask(I),
4066 Blend->getIncomingValue(I), Select,
4067 R.getDebugLoc(), "predphi", *Blend);
4068 Blend->replaceAllUsesWith(Select);
4069 Blend->eraseFromParent();
4070 continue;
4071 }
4072
// VPVectorEndPointerRecipe is kept; only a still-missing offset is
// materialized (expected only when the concrete UF is 1).
4073 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4074 if (!VEPR->getOffset()) {
4075 assert(Plan.getConcreteUF() == 1 &&
4076 "Expected unroller to have materialized offset for UF != 1");
4077 VEPR->materializeOffset();
4078 }
4079 continue;
4080 }
4081
// Expression recipes are decomposed and then removed.
4082 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4083 Expr->decompose();
4084 Expr->eraseFromParent();
4085 continue;
4086 }
4087
4088 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4089 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4090 if (LastActiveL &&
4091 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4092 // Create Not(Mask) for all operands.
4094 for (VPValue *Op : LastActiveL->operands()) {
4095 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4096 NotMasks.push_back(NotMask);
4097 }
4098
4099 // Create FirstActiveLane on the inverted masks.
4100 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4101 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4102
4103 // Subtract 1 to get the last active lane.
4104 VPValue *One =
4105 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4106 VPValue *LastLane =
4107 Builder.createSub(FirstInactiveLane, One,
4108 LastActiveL->getDebugLoc(), "last.active.lane");
4109
4110 LastActiveL->replaceAllUsesWith(LastLane);
4111 LastActiveL->eraseFromParent();
4112 continue;
4113 }
4114
4115 // Lower MaskedCond with block mask to LogicalAnd.
4117 auto *VPI = cast<VPInstruction>(&R);
4118 assert(VPI->isMasked() &&
4119 "Unmasked MaskedCond should be simplified earlier");
4120 VPI->replaceAllUsesWith(Builder.createNaryOp(
4121 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4122 VPI->eraseFromParent();
4123 continue;
4124 }
4125
4126 // Lower CanonicalIVIncrementForPart to plain Add.
4127 if (match(
4128 &R,
4130 auto *VPI = cast<VPInstruction>(&R);
4131 VPValue *Add = Builder.createOverflowingOp(
4132 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4133 VPI->getDebugLoc());
4134 VPI->replaceAllUsesWith(Add);
4135 VPI->eraseFromParent();
4136 continue;
4137 }
4138
4139 // Lower BranchOnCount to ICmp + BranchOnCond.
4140 VPValue *IV, *TC;
4141 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4142 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4143 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4144 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4145 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4146 BranchOnCountInst->eraseFromParent();
4147 continue;
4148 }
4149
// Everything that reaches this point and does not match the pattern below is
// left unchanged.
4150 VPValue *VectorStep;
4151 VPValue *ScalarStep;
4153 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4154 continue;
4155
4156 // Expand WideIVStep.
// Both steps are converted to the IV type when their types differ, then
// multiplied (FMul with the instruction's fast-math flags for FP IVs, Mul with
// default flags otherwise).
4157 auto *VPI = cast<VPInstruction>(&R);
4158 Type *IVTy = VPI->getScalarType();
4159 if (VectorStep->getScalarType() != IVTy) {
4161 ? Instruction::UIToFP
4162 : Instruction::Trunc;
4163 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4164 }
4165
4166 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4167 if (ScalarStep->getScalarType() != IVTy) {
4168 ScalarStep =
4169 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4170 }
4171
4172 VPIRFlags Flags;
4173 unsigned MulOpc;
4174 if (IVTy->isFloatingPointTy()) {
4175 MulOpc = Instruction::FMul;
4176 Flags = VPI->getFastMathFlags();
4177 } else {
4178 MulOpc = Instruction::Mul;
4179 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4180 }
4181
4182 VPInstruction *Mul = Builder.createNaryOp(
4183 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4184 VectorStep = Mul;
4185 VPI->replaceAllUsesWith(VectorStep);
4186 VPI->eraseFromParent();
4187 }
4188 }
4189}
4190
4192 VPBasicBlock *HeaderVPBB,
4193 VPBasicBlock *LatchVPBB,
4194 VPBasicBlock *MiddleVPBB,
4195 UncountableExitStyle Style) {
// Routes every early exit (an edge into an exit block that does not come from
// MiddleVPBB) through the latch: the latch terminator becomes a
// BranchOnTwoConds on "any early exit taken" and the original latch-exit
// condition, and a chain of dispatch blocks selects the exit block to enter.
4196 struct EarlyExitInfo {
4197 VPBasicBlock *EarlyExitingVPBB;
4198 VPIRBasicBlock *EarlyExitVPBB;
4199 VPValue *CondToExit;
4200 };
4201
4202 VPDominatorTree VPDT(Plan);
4203 VPBuilder Builder(LatchVPBB->getTerminator());
// Collect one EarlyExitInfo per early-exit edge.
4205 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4206 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4207 if (Pred == MiddleVPBB)
4208 continue;
4209 // Collect condition for this early exit.
4210 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4211 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4212 VPValue *CondOfEarlyExitingVPBB;
4213 [[maybe_unused]] bool Matched =
4214 match(EarlyExitingVPBB->getTerminator(),
4215 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4216 assert(Matched && "Terminator must be BranchOnCond");
4217
4218 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4219 // the correct block mask.
// The branch condition is negated when the exit block is not the true
// successor, so the recorded condition is true exactly when the exit is taken.
4220 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4221 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4223 TrueSucc == ExitBlock
4224 ? CondOfEarlyExitingVPBB
4225 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4226 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4227 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4228 VPDT.properlyDominates(
4229 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4230 LatchVPBB)) &&
4231 "exit condition must dominate the latch");
4232 Exits.push_back({
4233 EarlyExitingVPBB,
4234 ExitBlock,
4235 CondToEarlyExit,
4236 });
4237 }
4238 }
4239
4240 assert(!Exits.empty() && "must have at least one early exit");
4241 // Sort exits by RPO order to get correct program order. RPO gives a
4242 // topological ordering of the CFG, ensuring upstream exits are checked
4243 // before downstream exits in the dispatch chain.
4245 HeaderVPBB);
4247 for (const auto &[Num, VPB] : enumerate(RPOT))
4248 RPOIdx[VPB] = Num;
4249 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4250 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4251 });
4252#ifndef NDEBUG
4253 // After RPO sorting, verify that for any pair where one exit dominates
4254 // another, the dominating exit comes first. This is guaranteed by RPO
4255 // (topological order) and is required for the dispatch chain correctness.
4256 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4257 for (unsigned J = I + 1; J < Exits.size(); ++J)
4258 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4259 Exits[I].EarlyExitingVPBB) &&
4260 "RPO sort must place dominating exits before dominated ones");
4261#endif
4262
4263 // Build the AnyOf condition for the latch terminator using logical OR
4264 // to avoid poison propagation from later exit conditions when an earlier
4265 // exit is taken.
4266 VPValue *Combined = Exits[0].CondToExit;
4267 for (const EarlyExitInfo &Info : drop_begin(Exits))
4268 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4269
4270 VPValue *IsAnyExitTaken =
4271 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4272
4274 "Early exit store masking not implemented");
4275
4276 // Create the vector.early.exit blocks.
// With a single exit the block is named "vector.early.exit"; otherwise each
// block gets a ".<index>" suffix.
// NOTE(review): BlockSuffix is a named Twine, which LLVM's guidelines
// discourage; presumably safe here because both operands are simple twines
// whose contents are held by value - confirm against Twine::concat.
4277 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4278 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4279 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4280 VPBasicBlock *VectorEarlyExitVPBB =
4281 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4282 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4283 }
4284
4285 // Create the dispatch block (or reuse the single exit block if only one
4286 // exit). The dispatch block computes the first active lane of the combined
4287 // condition and, for multiple exits, chains through conditions to determine
4288 // which exit to take.
4289 VPBasicBlock *DispatchVPBB =
4290 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4291 : Plan.createVPBasicBlock("vector.early.exit.check");
4292 DispatchVPBB->setPredecessors({LatchVPBB});
4293 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4294 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4295 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4296
4297 // For each early exit, disconnect the original exiting block
4298 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4299 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4300 // values at the first active lane:
4301 //
4302 // Input:
4303 // early.exiting.I:
4304 // ...
4305 // EMIT branch-on-cond vp<%cond.I>
4306 // Successor(s): in.loop.succ, ir-bb<exit.I>
4307 //
4308 // ir-bb<exit.I>:
4309 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4310 //
4311 // Output:
4312 // early.exiting.I:
4313 // ...
4314 // Successor(s): in.loop.succ
4315 //
4316 // vector.early.exit.I:
4317 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4318 // Successor(s): ir-bb<exit.I>
4319 //
4320 // ir-bb<exit.I>:
4321 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4322 // vector.early.exit.I)
4323 //
4324 for (auto [Exit, VectorEarlyExitVPBB] :
4325 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4326 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4327 // Adjust the phi nodes in EarlyExitVPBB.
4328 // 1. remove incoming values from EarlyExitingVPBB,
4329 // 2. extract the incoming value at FirstActiveLane
4330 // 3. add back the extracts as last operands for the phis
4331 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4332 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4333 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4334 // values from VectorEarlyExitVPBB.
4335 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4336 auto *ExitIRI = cast<VPIRPhi>(&R);
4337 VPValue *IncomingVal =
4338 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4339 VPValue *NewIncoming = IncomingVal;
// Incoming values that are VPIRValues are reused as-is; no extract is created
// for them.
4340 if (!isa<VPIRValue>(IncomingVal)) {
4341 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4342 NewIncoming = EarlyExitBuilder.createNaryOp(
4343 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4344 DebugLoc::getUnknown(), "early.exit.value");
4345 }
4346 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4347 ExitIRI->addOperand(NewIncoming);
4348 }
4349
// Remove the early-exiting branch together with its edge to the exit block.
4350 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4351 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4352 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4353 }
4354
4355 // Chain through exits: for each exit, check if its condition is true at
4356 // the first active lane. If so, take that exit; otherwise, try the next.
4357 // The last exit needs no check since it must be taken if all others fail.
4358 //
4359 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4360 //
4361 // latch:
4362 // ...
4363 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4364 // ...
4365 //
4366 // vector.early.exit.check:
4367 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4368 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4369 // EMIT branch-on-cond vp<%at.cond.0>
4370 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4371 //
4372 // vector.early.exit.check.0:
4373 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4374 // EMIT branch-on-cond vp<%at.cond.1>
4375 // Successor(s): vector.early.exit.1, vector.early.exit.2
// With a single exit the loop below does not run, because the range with the
// last exit dropped is empty.
4376 VPBasicBlock *CurrentBB = DispatchVPBB;
4377 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4378 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4379 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4380 DebugLoc::getUnknown(), "exit.cond.at.lane");
4381
4382 // For the last dispatch, branch directly to the last exit on false;
4383 // otherwise, create a new check block.
4384 bool IsLastDispatch = (I + 2 == Exits.size());
4385 VPBasicBlock *FalseBB =
4386 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4387 : Plan.createVPBasicBlock(
4388 Twine("vector.early.exit.check.") + Twine(I));
4389
4390 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4391 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4392 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4393 FalseBB->setPredecessors({CurrentBB});
4394
4395 CurrentBB = FalseBB;
4396 DispatchBuilder.setInsertPoint(CurrentBB);
4397 }
4398
4399 // Replace the latch terminator with the new branching logic. The original
4400 // BranchOnCond's condition is used as the latch-exit condition; canonical IV
4401 // recipes have not been introduced yet, so there is no BranchOnCount to
4402 // derive the condition from.
4403 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4404 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4405 "Unexpected terminator");
4406 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4407
4408 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4409 LatchExitingBranch->eraseFromParent();
4410 Builder.setInsertPoint(LatchVPBB);
4411 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4412 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
// Successor order matches the two conditions: dispatch block when any early
// exit is taken, middle block when the latch exit is taken, header otherwise.
4413 LatchVPBB->clearSuccessors();
4414 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4415}
4416
4417/// This function tries to convert extended in-loop reductions to
4418/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4419/// valid. The created recipe must be decomposed to its constituent
4420/// recipes before execution.
/// \returns the new VPExpressionRecipe, or nullptr when no expression recipe
/// is created.
4421static VPExpressionRecipe *
4423 VFRange &Range) {
4424 Type *RedTy = Red->getScalarType();
4425 VPValue *VecOp = Red->getVecOp();
4426
4427 assert(!Red->isPartialReduction() &&
4428 "This path does not support partial reductions");
4429
4430 // Clamp the range if using extended-reduction is profitable.
// The per-VF predicate holds when the target's extended-reduction cost is
// valid and strictly lower than the separate extend and reduction costs
// combined.
4431 auto IsExtendedRedValidAndClampRange =
4432 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4434 [&](ElementCount VF) {
4435 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4437
4439 InstructionCost ExtCost =
4440 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4441 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4442
4443 assert(!RedTy->isFloatingPointTy() &&
4444 "getExtendedReductionCost only supports integer types");
4445 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4446 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4447 Red->getFastMathFlags(), CostKind);
4448 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4449 },
4450 Range);
4451 };
4452
4453 VPValue *A;
4454 // Match reduce(ext(A)).
4456 IsExtendedRedValidAndClampRange(
4457 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4458 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4459 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4460
4461 return nullptr;
4462}
4463
4464/// This function tries to convert extended in-loop reductions to
4465/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4466/// and valid. The created VPExpressionRecipe must be decomposed to its
4467/// constituent recipes before execution. Patterns of the
4468/// VPExpressionRecipe:
4469/// reduce.add(mul(...)),
4470/// reduce.add(mul(ext(A), ext(B))),
4471/// reduce.add(ext(mul(ext(A), ext(B)))).
4472/// reduce.fadd(fmul(ext(A), ext(B)))
///
/// Only Add, Sub and FAdd recurrence opcodes get past the first check. For Sub
/// reductions a `sub 0, x` between the reduction and the multiply is looked
/// through and, where supported, bundled into the expression.
///
/// NOTE(review): although FAdd passes the opcode check and the fadd/fmul
/// pattern is listed above, the function returns nullptr for any
/// floating-point reduction type before matching, so that pattern is not
/// formed on this path.
///
/// \returns a newly allocated VPExpressionRecipe, or nullptr if nothing
/// matched or the bundled form is not cheaper for the clamped \p Range.
///
/// NOTE(review): the embedded numbering skips 4474, 4487, 4489, 4491 and 4552;
/// the function name and a few interior lines are not present in this listing.
4473static VPExpressionRecipe *
4475 VPCostContext &Ctx, VFRange &Range) {
4476 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4477 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4478 Opcode != Instruction::FAdd)
4479 return nullptr;
4480
4481 assert(!Red->isPartialReduction() &&
4482 "This path does not support partial reductions");
4483 Type *RedTy = Red->getScalarType();
4484
4485 // Clamp the range if using multiply-accumulate-reduction is profitable.
4486 auto IsMulAccValidAndClampRange =
4488 VPWidenCastRecipe *OuterExt) -> bool {
4490 [&](ElementCount VF) {
// Without inner extends the multiply already operates on the reduction type.
4492 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4493 InstructionCost MulAccCost;
4494
4495 // getMulAccReductionCost for in-loop reductions does not support
4496 // mixed or floating-point extends.
4497 if (Ext0 && Ext1 &&
4498 (Ext0->getOpcode() != Ext1->getOpcode() ||
4499 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4500 return false;
4501
// A missing extend is passed to the cost hook as if it were a zext.
4502 bool IsZExt =
4503 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4504 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4505 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4506 SrcVecTy, CostKind);
4507
// Cost of the unbundled form: multiply, reduction and every extend present.
4508 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4509 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4510 InstructionCost ExtCost = 0;
4511 if (Ext0)
4512 ExtCost += Ext0->computeCost(VF, Ctx);
4513 if (Ext1)
4514 ExtCost += Ext1->computeCost(VF, Ctx);
4515 if (OuterExt)
4516 ExtCost += OuterExt->computeCost(VF, Ctx);
4517
4518 return MulAccCost.isValid() &&
4519 MulAccCost < ExtCost + MulCost + RedCost;
4520 },
4521 Range);
4522 };
4523
4524 VPValue *VecOp = Red->getVecOp();
4525 VPRecipeBase *Sub = nullptr;
4526 VPValue *A, *B;
4527 VPValue *Tmp = nullptr;
4528
// Floating-point reductions are rejected before any pattern is matched.
4529 if (RedTy->isFloatingPointTy())
4530 return nullptr;
4531
4532 // Sub reductions could have a sub between the add reduction and vec op.
4533 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4534 Sub = VecOp->getDefiningRecipe();
4535 VecOp = Tmp;
4536 }
4537
4538 // If ValB is a constant and can be safely extended, truncate it to the same
4539 // type as ExtA's operand, then extend it to the same type as ExtA. This
4540 // creates two uniform extends that can more easily be matched by the rest of
4541 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4542 // replaced with the new extend of the constant.
4543 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4544 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4545 VPWidenRecipe *Mul) {
// Only applies when operand A is an extend and operand B is a plain IR value.
4546 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4547 return;
4548 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4549 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4550 const APInt *Const;
4551 if (!match(ValB, m_APInt(Const)) ||
4553 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4554 return;
4555 // The truncate ensures that the type of each extended operand is the
4556 // same, and it's been proven that the constant can be extended from
4557 // NarrowTy safely. Necessary since ExtA's extended operand would be
4558 // e.g. an i8, while the const will likely be an i32. This will be
4559 // elided by later optimisations.
4560 VPBuilder Builder(Mul);
4561 auto *Trunc =
4562 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4563 Type *WideTy = ExtA->getScalarType();
4564 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4565 Mul->setOperand(1, ExtB);
4566 };
4567
4568 // Try to match reduce.add(mul(...)).
4569 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4570 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4571 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4572 auto *Mul = cast<VPWidenRecipe>(VecOp);
4573
4574 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4575 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4576
4577 // Match reduce.add/sub(mul(ext, ext)).
4578 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4579 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4580 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4581 if (Sub)
4582 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4583 cast<VPWidenRecipe>(Sub), Red);
4584 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4585 }
4586 // TODO: Add an expression type for this variant with a negated mul
4587 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4588 return new VPExpressionRecipe(Mul, Red);
4589 }
4590 // TODO: Add an expression type for negated versions of other expression
4591 // variants.
4592 if (Sub)
4593 return nullptr;
4594
4595 // Match reduce.add(ext(mul(A, B))).
4596 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4597 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4598 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4599 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4600 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4601
4602 // reduce.add(ext(mul(ext, const)))
4603 // -> reduce.add(ext(mul(ext, ext(const))))
4604 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4605
4606 // reduce.add(ext(mul(ext(A), ext(B))))
4607 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4608 // The inner extends must either have the same opcode as the outer extend or
4609 // be the same, in which case the multiply can never result in a negative
4610 // value and the outer extend can be folded away by doing wider
4611 // extends for the operands of the mul.
4612 if (Ext0 && Ext1 &&
4613 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4614 Ext0->getOpcode() == Ext1->getOpcode() &&
4615 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// Re-create the inner extends so they produce the outer extend's type directly.
4616 auto *NewExt0 = new VPWidenCastRecipe(
4617 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
4618 *Ext0, *Ext0, Ext0->getDebugLoc());
4619 NewExt0->insertBefore(Ext0);
4620
// When both multiply operands are the same extend, share the widened copy.
4621 VPWidenCastRecipe *NewExt1 = NewExt0;
4622 if (Ext0 != Ext1) {
4623 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4624 Ext->getScalarType(), nullptr, *Ext1,
4625 *Ext1, Ext1->getDebugLoc());
4626 NewExt1->insertBefore(Ext1);
4627 }
4628 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
4629 NewMul->insertBefore(Mul);
// The outer extend and the old multiply are replaced by NewMul and erased.
4630 Ext->replaceAllUsesWith(NewMul);
4631 Ext->eraseFromParent();
4632 Mul->eraseFromParent();
4633 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
4634 }
4635 }
4636 return nullptr;
4637}
4638
4639/// This function tries to create abstract recipes from the reduction recipe for
4640/// following optimizations and cost estimation.
///
/// The multiply-accumulate pattern is tried first and the extended-reduction
/// pattern second. If either produces a VPExpressionRecipe, it is inserted at
/// the position right after \p Red and all uses of \p Red are redirected to
/// it. If neither matches, the plan is left as the matchers left it.
///
/// NOTE(review): the embedded numbering skips 4641; the line carrying the
/// function name and first parameter is not present in this listing.
4642 VPCostContext &Ctx,
4643 VFRange &Range) {
4644 // Creation of VPExpressions for partial reductions is entirely handled in
4645 // transformToPartialReduction.
4646 assert(!Red->isPartialReduction() &&
4647 "This path does not support partial reductions");
4648
4649 VPExpressionRecipe *AbstractR = nullptr;
// Capture the insertion point and parent block before running the matchers.
4650 auto IP = std::next(Red->getIterator());
4651 auto *VPBB = Red->getParent();
4652 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4653 AbstractR = MulAcc;
4654 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4655 AbstractR = ExtRed;
4656 // Cannot create abstract inloop reduction recipes.
4657 if (!AbstractR)
4658 return;
4659
4660 AbstractR->insertBefore(*VPBB, IP);
4661 Red->replaceAllUsesWith(AbstractR);
4662}
4663
4674
// Body of a transform that inserts explicit Broadcast VPInstructions for
// values defined outside the vector loop: the backedge-taken count (if any),
// the plan's live-ins and every value defined in the plan's entry block.
// Plans with only a scalar VF are left untouched.
// NOTE(review): the embedded numbering skips 4675 and 4692; the signature line
// and the first half of the skip condition are not present in this listing.
4676 if (Plan.hasScalarVFOnly())
4677 return;
4678
// The dominator tree is only needed by the assertion below.
4679#ifndef NDEBUG
4680 VPDominatorTree VPDT(Plan);
4681#endif
4682
4683 SmallVector<VPValue *> VPValues;
4684 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4685 VPValues.push_back(BTC);
4686 append_range(VPValues, Plan.getLiveIns());
4687 for (VPRecipeBase &R : *Plan.getEntry())
4688 append_range(VPValues, R.definedValues());
4689
4690 auto *VectorPreheader = Plan.getVectorPreheader();
4691 for (VPValue *VPV : VPValues) {
// Constant IR live-ins are skipped (together with the condition on the
// missing line above).
4693 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4694 continue;
4695
4696 // Add explicit broadcast at the insert point that dominates all users.
4697 VPBasicBlock *HoistBlock = VectorPreheader;
4698 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4699 for (VPUser *User : VPV->users()) {
4700 if (User->usesScalars(VPV))
4701 continue;
// A vector user inside the preheader forces the broadcast to the block start.
4702 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4703 HoistPoint = HoistBlock->begin();
4704 else
4705 assert(VPDT.dominates(VectorPreheader,
4706 cast<VPRecipeBase>(User)->getParent()) &&
4707 "All users must be in the vector preheader or dominated by it");
4708 }
4709
4710 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4711 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Rewrite only vector users, and never the broadcast's own operand.
4712 VPV->replaceUsesWithIf(Broadcast,
4713 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4714 return Broadcast != &U && !U.usesScalars(VPV);
4715 });
4716 }
4717}
4718
4719// Collect common metadata from a group of replicate recipes by intersecting
4720// metadata from all recipes in the group.
// Starts from a copy of the first recipe's metadata and intersects it with
// each remaining recipe; the first element is dereferenced unconditionally, so
// the group must be non-empty.
// NOTE(review): the embedded numbering skips 4721; the signature line is not
// present in this listing.
4722 VPIRMetadata CommonMetadata = *Recipes.front();
4723 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4724 CommonMetadata.intersect(*Recipe);
4725 return CommonMetadata;
4726}
4727
// Collects groups of predicated replicate recipes (loads or stores, selected
// by the Opcode template argument) that were grouped per address and in which
// at least one pair of masks is complementary (M1 == NOT(M2)). Address groups
// with fewer than two recipes are ignored. Each address group is emitted as a
// single candidate group, which is kept only if a complementary pair is seen.
// NOTE(review): the embedded numbering skips 4729-4731, 4738-4739, 4746 and
// 4756; the function name, leading parameters and the declarations of Groups,
// AllGroups and Group are not present in this listing.
4728template <unsigned Opcode>
4732 const Loop *L) {
4733 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4734 "Only Load and Store opcodes supported");
4735 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
4736
4737 // For each address, collect operations with the same or complementary masks.
4740 Plan, PSE, L,
4741 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4742 for (auto Recipes : Groups) {
4743 if (Recipes.size() < 2)
4744 continue;
4745
4747 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
4748 "Expected all recipes in group to have the same load-store type");
4749
4750 // Collect groups with the same or complementary masks.
4751 for (VPReplicateRecipe *&RecipeI : Recipes) {
// Entries are nulled out once they have been placed into a group.
4752 if (!RecipeI)
4753 continue;
4754
4755 VPValue *MaskI = RecipeI->getMask();
4757 Group.push_back(RecipeI);
4758 RecipeI = nullptr;
4759
4760 // Find all operations with the same or complementary masks.
4761 bool HasComplementaryMask = false;
4762 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4763 if (!RecipeJ)
4764 continue;
4765
4766 VPValue *MaskJ = RecipeJ->getMask();
4767 // Check if any operation in the group has a complementary mask with
4768 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4769 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4770 match(MaskJ, m_Not(m_Specific(MaskI)));
// Every remaining recipe joins the group, whether or not its mask matched.
4771 Group.push_back(RecipeJ);
4772 RecipeJ = nullptr;
4773 }
4774
4775 if (HasComplementaryMask) {
4776 assert(Group.size() >= 2 && "must have at least 2 entries");
4777 AllGroups.push_back(std::move(Group));
4778 }
4779 }
4780 }
4781
4782 return AllGroups;
4783}
4784
4785// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class (callers in this file use LoadInst and
// StoreInst) whose getAlign() is compared. The result of min_element is
// dereferenced directly, so the group must be non-empty.
// NOTE(review): the embedded numbering skips 4788; the line carrying the
// function name and parameter is not present in this listing.
4786template <typename InstType>
4787static VPReplicateRecipe *
4789 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4790 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4791 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4792 });
4793}
4794
// Replaces each group of predicated loads with complementary masks by a single
// unpredicated load placed before the first load of the group. A group is
// skipped when its memory location cannot be determined or the no-alias check
// between the first and last block fails.
// NOTE(review): the embedded numbering skips 4795-4796 and 4799; the function
// name, leading parameters and the call producing Groups are not present in
// this listing.
4797 const Loop *L) {
4798 auto Groups =
4800 if (Groups.empty())
4801 return;
4802
4803 // Process each group of loads.
4804 for (auto &Group : Groups) {
4805 // Try to use the earliest (most dominating) load to replace all others.
4806 VPReplicateRecipe *EarliestLoad = Group[0];
4807 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4808 VPBasicBlock *LastBB = Group.back()->getParent();
4809
4810 // Check that the load doesn't alias with stores between first and last.
4811 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4812 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4813 continue;
4814
4815 // Collect common metadata from all loads in the group.
4816 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4817
4818 // Find the load with minimum alignment to use.
4819 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4820
4821 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4822 assert(all_of(Group,
4823 [IsSingleScalar](VPReplicateRecipe *R) {
4824 return R->isSingleScalar() == IsSingleScalar;
4825 }) &&
4826 "all members in group must agree on IsSingleScalar");
4827
4828 // Create an unpredicated version of the earliest load with common
4829 // metadata.
// The underlying instruction comes from the least-aligned load, while the
// address operand comes from the earliest load.
4830 auto *UnpredicatedLoad = new VPReplicateRecipe(
4831 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4832 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4833
4834 UnpredicatedLoad->insertBefore(EarliestLoad);
4835
4836 // Replace all loads in the group with the unpredicated load.
4837 for (VPReplicateRecipe *Load : Group) {
4838 Load->replaceAllUsesWith(UnpredicatedLoad);
4839 Load->eraseFromParent();
4840 }
4841 }
4842}
4843
// Returns true if the stores in StoresToSink can be sunk to the position of
// the last store of the group according to canHoistOrSinkWithNoAliasCheck.
// Returns false when the first store's memory location is unknown or carries
// no alias-scope metadata. StoresToSink must be non-empty, since front() and
// back() are used unconditionally.
// NOTE(review): the embedded numbering skips 4845; the line carrying the
// function name and first parameter is not present in this listing.
4844static bool
4846 PredicatedScalarEvolution &PSE, const Loop &L) {
4847 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4848 if (!StoreLoc || !StoreLoc->AATags.Scope)
4849 return false;
4850
4851 // When sinking a group of stores, all members of the group alias each other.
4852 // Skip them during the alias checks.
4853 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4854 StoresToSink.end());
4855
4856 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4857 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4858 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L);
4859 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4860}
4861
// Replaces each group of predicated stores with complementary masks by a
// single unconditional store inserted at the position of the last store in
// the group. The stored value is chosen by a chain of selects over the
// members' masks. Groups that fail canSinkStoreWithNoAliasCheck are skipped.
// NOTE(review): the embedded numbering skips 4862-4863 and 4866; the function
// name, leading parameters and the call producing Groups are not present in
// this listing.
4864 const Loop *L) {
4865 auto Groups =
4867 if (Groups.empty())
4868 return;
4869
4870 for (auto &Group : Groups) {
4871 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
4872 continue;
4873
4874 // Use the last (most dominated) store's location for the unconditional
4875 // store.
4876 VPReplicateRecipe *LastStore = Group.back();
4877 VPBasicBlock *InsertBB = LastStore->getParent();
4878
4879 // Collect common alias metadata from all stores in the group.
4880 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4881
4882 // Build select chain for stored values.
// The first store's value is the fallback; each later store overrides it when
// its own mask is set.
4883 VPValue *SelectedValue = Group[0]->getOperand(0);
4884 VPBuilder Builder(InsertBB, LastStore->getIterator());
4885
4886 bool IsSingleScalar = Group[0]->isSingleScalar();
4887 for (unsigned I = 1; I < Group.size(); ++I) {
4888 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4889 "all members in group must agree on IsSingleScalar");
4890 VPValue *Mask = Group[I]->getMask();
4891 VPValue *Value = Group[I]->getOperand(0);
4892 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4893 Group[I]->getDebugLoc());
4894 }
4895
4896 // Find the store with minimum alignment to use.
4897 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4898
4899 // Create unconditional store with selected value and common metadata.
// The address operand is taken from the last store of the group.
4900 auto *UnpredicatedStore = new VPReplicateRecipe(
4901 StoreWithMinAlign->getUnderlyingInstr(),
4902 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4903 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4904 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4905
4906 // Remove all predicated stores from the group.
4907 for (VPReplicateRecipe *Store : Group)
4908 Store->eraseFromParent();
4909 }
4910}
4911
// If the trip count is a used IR live-in that SCEV folds to a constant, and
// the early exits below do not fire, computes the vector trip count
// (TC / (VF * UF)) * (VF * UF) with SCEV and, when that folds to a constant,
// records it as the underlying value of the plan's vector trip count.
// NOTE(review): the embedded numbering skips 4912, 4914 and 4926; the function
// name, trailing parameters and part of the skip condition are not present in
// this listing.
4913 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4915 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4916 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4917
4918 VPValue *TC = Plan.getTripCount();
4919 if (TC->getNumUsers() == 0)
4920 return;
4921
4922 // Skip cases for which the trip count may be non-trivial to materialize.
4923 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4924 // tail is required.
4925 if (!Plan.hasScalarTail() ||
4927 Plan.getScalarPreheader() ||
4928 !isa<VPIRValue>(TC))
4929 return;
4930
4931 // Materialize vector trip counts for constants early if it can simply
4932 // be computed as (Original TC / VF * UF) * VF * UF.
4933 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4934 // tail-folded loops.
4935 ScalarEvolution &SE = *PSE.getSE();
4936 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4937 if (!isa<SCEVConstant>(TCScev))
4938 return;
4939 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4940 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
// If the expression does not fold to a constant, nothing is recorded.
4941 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4942 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4943}
4944
// Materializes the backedge-taken count as `trip count - 1` (named
// "trip.count.minus.1") at the start of VectorPH and redirects all users of
// BTC to it. Does nothing when BTC has no users.
// NOTE(review): the embedded numbering skips 4945 and 4947; the function name,
// first parameter and the definition of BTC are not present in this listing.
4946 VPBasicBlock *VectorPH) {
4948 if (BTC->getNumUsers() == 0)
4949 return;
4950
4951 VPBuilder Builder(VectorPH, VectorPH->begin());
4952 auto *TCTy = Plan.getTripCount()->getScalarType();
4953 auto *TCMO =
4954 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4955 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4956 BTC->replaceAllUsesWith(TCMO);
4957}
4958
// Makes scalar-to-vector and vector-to-scalar conversions explicit:
//  1. Inserts a build-vector VPInstruction after a scalar-producing recipe
//     that has vector users or users outside the top-level loop region, and
//     rewrites those users to use it.
//  2. For blocks inside the loop region, inserts an Unpack VPInstruction for
//     a definition that has scalar users outside replicate regions.
// Plans with only a scalar VF are left untouched.
// NOTE(review): the embedded numbering skips 4959, 4965, 4976, 4986,
// 4993-4994, 5012 and 5021; the signature and several filter conditions are
// not present in this listing.
4960 if (Plan.hasScalarVFOnly())
4961 return;
4962
4963 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4964 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4966 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4967 vp_depth_first_shallow(LoopRegion->getEntry()));
4968 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
4969 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
4970 // regions. Those are not materialized explicitly yet.
4971 // TODO: materialize build vectors for replicating recipes in replicating
4972 // regions.
4973 for (VPBasicBlock *VPBB :
4974 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4975 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4977 continue;
4978 auto *DefR = cast<VPSingleDefRecipe>(&R);
// True for users that need the vector value, or that sit in a region other
// than the top-level loop region.
4979 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4980 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4981 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4982 };
// Skip single-scalar replicates and definitions without qualifying users.
4983 if ((isa<VPReplicateRecipe>(DefR) &&
4984 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4985 (isa<VPInstruction>(DefR) &&
4987 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4988 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4989 continue;
4990
4991 Type *ScalarTy = DefR->getScalarType();
4992 unsigned Opcode = ScalarTy->isStructTy()
4995 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4996 BuildVector->insertAfter(DefR);
4997
// BuildVector itself keeps using DefR; only the other qualifying users move.
4998 DefR->replaceUsesWithIf(
4999 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5000 VPUser &U, unsigned) {
5001 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5002 });
5003 }
5004 }
5005
5006 // Create explicit VPInstructions to convert vectors to scalars. The current
5007 // implementation is conservative - it may miss some cases that may or may not
5008 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5009 // if they are known to operate on scalar values.
5010 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5011 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5013 VPDerivedIVRecipe>(&R))
5014 continue;
5015 for (VPValue *Def : R.definedValues()) {
5016 // Skip recipes that are single-scalar or only have their first lane
5017 // used.
5018 // TODO: The Defs skipped here may or may not be vector values.
5019 // Introduce Unpacks, and remove them later, if they are guaranteed to
5020 // produce scalar values.
5022 continue;
5023
5024 // At the moment, we create unpacks only for scalar users outside
5025 // replicate regions. Recipes inside replicate regions still extract the
5026 // required lanes implicitly.
5027 // TODO: Remove once replicate regions are unrolled completely.
5028 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5029 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5030 return U->usesScalars(Def) &&
5031 (!ParentRegion || !ParentRegion->isReplicator());
5032 };
5033 if (none_of(Def->users(), IsCandidateUnpackUser))
5034 continue;
5035
5036 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
// For a phi definition the unpack goes at the block's first non-phi position.
5037 if (R.isPhi())
5038 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5039 else
5040 Unpack->insertAfter(&R);
5041 Def->replaceUsesWithIf(Unpack,
5042 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5043 return IsCandidateUnpackUser(&U);
5044 });
5045 }
5046 }
5047 }
5048}
5049
// Materializes the plan's vector trip count in VectorPHVPBB and redirects all
// users of the symbolic value to the computed one. The result ("n.vec") is
// TC - (TC urem Step), where
//  - with TailByMasking, TC is first rounded up by adding Step - 1, and
//  - with RequiresScalarEpilogue, a zero remainder is replaced by Step.
// If TC is a constant divisible by *MaxRuntimeStep and no scalar epilogue is
// required, TC itself is used. Nothing is done if the vector trip count has
// no users or already has an underlying IR value.
// NOTE(review): the embedded numbering skips 5050; the line carrying the
// function name is not present in this listing.
5051 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5052 bool RequiresScalarEpilogue, VPValue *Step,
5053 std::optional<uint64_t> MaxRuntimeStep) {
5054 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5055 // There's nothing to do if there are no users of the vector trip count or its
5056 // IR value has already been set.
5057 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5058 return;
5059
5060 VPValue *TC = Plan.getTripCount();
5061 Type *TCTy = TC->getScalarType();
5062 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5063 if (auto *StepR = Step->getDefiningRecipe()) {
5064 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5065 "Step VPBB must dominate VectorPHVPBB");
5066 // Insert after Step's definition to maintain valid def-use ordering.
5067 InsertPt = std::next(StepR->getIterator());
5068 }
5069 VPBuilder Builder(VectorPHVPBB, InsertPt);
5070
5071 // For scalable steps, if TC is a constant and is divisible by the maximum
5072 // possible runtime step, then TC % Step == 0 for all valid vscale values
5073 // and the vector trip count equals TC directly.
5074 const APInt *TCVal;
5075 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5076 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
5077 VectorTC.replaceAllUsesWith(TC);
5078 return;
5079 }
5080
5081 // If the tail is to be folded by masking, round the number of iterations N
5082 // up to a multiple of Step instead of rounding down. This is done by first
5083 // adding Step-1 and then rounding down. Note that it's ok if this addition
5084 // overflows: the vector induction variable will eventually wrap to zero given
5085 // that it starts at zero and its Step is a power of two; the loop will then
5086 // exit, with the last early-exit vector comparison also producing all-true.
5087 if (TailByMasking) {
5088 TC = Builder.createAdd(
5089 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5090 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5091 }
5092
5093 // Now we need to generate the expression for the part of the loop that the
5094 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5095 // iterations are not required for correctness, or N - Step, otherwise. Step
5096 // is equal to the vectorization factor (number of SIMD elements) times the
5097 // unroll factor (number of SIMD instructions).
5098 VPValue *R =
5099 Builder.createNaryOp(Instruction::URem, {TC, Step},
5100 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5101
5102 // There are cases where we *must* run at least one iteration in the remainder
5103 // loop. See the cost model for when this can happen. If the step evenly
5104 // divides the trip count, we set the remainder to be equal to the step. If
5105 // the step does not evenly divide the trip count, no adjustment is necessary
5106 // since there will already be scalar iterations. Note that the minimum
5107 // iterations check ensures that N >= Step.
5108 if (RequiresScalarEpilogue) {
// NOTE(review): "fail folding" in the message below is presumably a typo for
// "tail folding".
5109 assert(!TailByMasking &&
5110 "requiring scalar epilogue is not supported with fail folding");
5111 VPValue *IsZero =
5112 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5113 R = Builder.createSelect(IsZero, Step, R);
5114 }
5115
5116 VPValue *Res =
5117 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5118 VectorTC.replaceAllUsesWith(Res);
5119}
5120
// Replaces the plan's symbolic VF and VFxUF values with concrete values built
// at the start of VectorPH, using the trip count's scalar type. If VF has no
// users, only VFxUF is materialized, as a single element count of
// VFEC * concrete UF. Otherwise VF becomes an element count of VFEC and VFxUF
// becomes that value multiplied by the concrete UF.
// NOTE(review): the embedded numbering skips 5121, 5147 and 5149; the function
// name, leading parameters and the condition guarding the broadcast are not
// present in this listing.
5122 ElementCount VFEC) {
5123 // If VF and VFxUF have already been materialized (no remaining users),
5124 // there's nothing more to do.
5125 if (Plan.getVF().isMaterialized()) {
5126 assert(Plan.getVFxUF().isMaterialized() &&
5127 "VF and VFxUF must be materialized together");
5128 return;
5129 }
5130
5131 VPBuilder Builder(VectorPH, VectorPH->begin());
5132 Type *TCTy = Plan.getTripCount()->getScalarType();
5133 VPValue &VF = Plan.getVF();
5134 VPValue &VFxUF = Plan.getVFxUF();
5135 // If there are no users of the runtime VF, compute VFxUF by constant folding
5136 // the multiplication of VF and UF.
5137 if (VF.getNumUsers() == 0) {
5138 VPValue *RuntimeVFxUF =
5139 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5140 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5141 return;
5142 }
5143
5144 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5145 // vscale) * UF.
5146 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
// Users that need a vector of VF receive an explicit broadcast.
5148 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5150 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5151 }
5152 VF.replaceAllUsesWith(RuntimeVF);
5153
// NOTE(review): {true, false} is presumably {no-unsigned-wrap,
// no-signed-wrap}; confirm against createOverflowingOp.
5154 VPValue *MulByUF = Builder.createOverflowingOp(
5155 Instruction::Mul,
5156 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5157 {true, false});
5158 VFxUF.replaceAllUsesWith(MulByUF);
5159}
5160
// Creates an IncomingAliasMask VPInstruction (named "incoming.alias.mask", of
// i1 type) via a builder on the vector preheader and ANDs it into the header
// mask: every existing user of the header mask is rewritten to use
// "HeaderMask & AliasMask".
// NOTE(review): the result of findHeaderMask is dereferenced without a null
// check; presumably callers guarantee that a header mask exists - confirm.
// NOTE(review): the embedded numbering skips 5161; the signature line is not
// present in this listing.
5162 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5163 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5164 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5165
5166 VPBuilder Builder(Plan.getVectorPreheader());
5167 auto *AliasMask = Builder.createNaryOp(
5168 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5169 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5170
// Place the AND after the header mask definition, or at the first non-phi
// position when the header mask is a phi.
5171 if (HeaderMaskDef->isPhi())
5172 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5173 else
5174 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5175
5176 // Update all existing users of the header mask to "HeaderMask & AliasMask".
5177 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
// The AND itself must keep using the original header mask.
5178 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5179 return &U != ClampedHeaderMask;
5180 });
5181}
5182
// Builds the concrete alias mask in AliasCheckVPBB: one
// loop_dependence_war_mask intrinsic per entry of DiffChecks, combined with
// AND. All uses of the plan's incoming alias mask are then replaced by the
// result.
//
// \returns the "clamped VF": the number of active lanes of the alias mask
// ("num.active.lanes"), converted from the index type to IVTy.
//
// NOTE(review): if DiffChecks is empty, AliasMask is still null when it is
// passed to NumActiveLanes and replaceAllUsesWith; presumably callers only
// invoke this with at least one check - confirm.
// NOTE(review): the embedded numbering skips 5184, 5194, 5196 and 5219; the
// function name, leading parameters and the definitions of Src, Sink and IVTy
// are not present in this listing.
5183VPValue *
5185 ArrayRef<PointerDiffInfo> DiffChecks) {
5186 VPBuilder Builder(AliasCheckVPBB);
5187 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5188
5189 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5190 assert(IncomingAliasMask && "Expected an alias mask!");
5191
5192 VPValue *AliasMask = nullptr;
5193 for (const PointerDiffInfo &Check : DiffChecks) {
5195 VPValue *Sink =
5197 Type *AddrType = Src->getScalarType();
5198
5199 // TODO: Only freeze the required pointer (not both src and sink).
5200 if (Check.NeedsFreeze) {
5201 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5202 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5203 }
5204
5205 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5206 // dependency between the source and the sink. This is not necessary for
5207 // correctness of the mask, but using the "raw" variant prevents loads
5208 // depending on the completion of stores.
5209 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5210 Intrinsic::loop_dependence_war_mask,
5211 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5212
// Accumulate the per-check masks with AND.
5213 if (AliasMask)
5214 AliasMask = Builder.createAnd(AliasMask, WARMask);
5215 else
5216 AliasMask = WARMask;
5217 }
5218
5220 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5221 VPValue *NumActive = Builder.createNaryOp(
5222 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5223 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5224 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5225 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5226
5227 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5228
5229 return ClampedVF;
5230}
5231
// Creates the "vector.clamped.vf.check" block, materializes the alias mask in
// it and attaches a check whose condition is true when either
//  - the clamped VF is <= 1 ("vf.is.scalar"), or
//  - (MaxUIntTripCount - trip count) < clamped VF ("vf.step.overflow").
// Afterwards all uses of the plan's VF and VFxUF are replaced by the clamped
// VF. The concrete UF is asserted to be 1.
// NOTE(review): the embedded numbering skips 5232, 5239, 5249 and 5266; the
// function name, the definition of DL, the initializer of MaxUIntTripCount
// and the callee of the trip-count materialization are not present in this
// listing.
5233 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
5234 VPBasicBlock *ClampedVFCheck =
5235 Plan.createVPBasicBlock("vector.clamped.vf.check");
5236
5237 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5238 VPBuilder Builder(ClampedVFCheck);
5240 Type *TCTy = Plan.getTripCount()->getScalarType();
5241
5242 // Check the "ClampedVF" from the alias mask is larger than one.
// The compare is inverted: IsScalar is true when the clamped VF is 0 or 1.
5243 VPValue *IsScalar =
5244 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5245 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5246
5247 VPValue *TripCount = Plan.getTripCount();
5248 VPValue *MaxUIntTripCount =
5250 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5251
5252 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5253 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5254 // condition (index.next == n.vec) may not be correct in the case of an
5255 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5256 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5257 // power-of-two).
5258 VPValue *TripCountCheck = Builder.createICmp(
5259 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5260
5261 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5262 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5263
5264 // Materialize the trip count early as this will add a use of (VFxUF) that
5265 // needs to be replaced with the ClampedVF.
5267 /*TailByMasking=*/true,
5268 /*RequiresScalarEpilogue=*/false,
5269 &Plan.getVFxUF());
5270
5271 assert(Plan.getConcreteUF() == 1 &&
5272 "Clamped VF not supported with interleaving");
5273 Plan.getVF().replaceAllUsesWith(ClampedVF);
5274 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5275}
5276
// Expands the VPExpandSCEVRecipes in the plan's entry block into
// VPInstructions using VPSCEVExpander. Recipes without users, and recipes the
// expander cannot handle (tryToExpand returns null), are left in place. A
// successfully expanded recipe has its uses replaced and is erased; if it was
// the plan's trip count, the trip count is reset to the expanded value.
// NOTE(review): the embedded numbering skips 5277; the line carrying the
// function name and first parameter is not present in this listing.
5278 ScalarEvolution &SE) {
5279 auto *Entry = Plan.getEntry();
5280 VPBuilder Builder(Entry, Entry->begin());
5281 VPSCEVExpander Expander(Builder, SE);
5282
5283 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5284 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5285 // late expansion.
5286 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5287 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5288 if (!ExpSCEV || ExpSCEV->getNumUsers() == 0)
5289 continue;
// New instructions are emitted right before the recipe being expanded.
5290 Builder.setInsertPoint(ExpSCEV);
5291 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
5292 if (!Expanded)
5293 continue;
5294 ExpSCEV->replaceAllUsesWith(Expanded);
5295 if (Plan.getTripCount() == ExpSCEV)
5296 Plan.resetTripCount(Expanded);
5297 ExpSCEV->eraseFromParent();
5298 }
5299}
5300
// Expands every remaining VPExpandSCEVRecipe in the plan's entry block to IR
// with SCEVExpander, inserting before the terminator of the entry IR basic
// block. Each recipe is replaced by a live-in for the expanded value and
// erased; the plan's trip count is reset if it was that recipe.
//
// Returns a map from each expanded SCEV to the IR value generated for it.
// NOTE(review): the embedded numbering skips 5301-5302, 5323 and 5334; the
// signature, the condition of the trailing assertion and the statement that
// wraps newly created IR instructions are not present in this listing.
5303 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5304
5305 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5306 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5307 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5308 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5309 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5310 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5311 if (!ExpSCEV)
5312 continue;
5313 const SCEV *Expr = ExpSCEV->getSCEV();
5314 Value *Res =
5315 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5316 ExpandedSCEVs[Expr] = Res;
5317 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5318 ExpSCEV->replaceAllUsesWith(Exp);
5319 if (Plan.getTripCount() == ExpSCEV)
5320 Plan.resetTripCount(Exp);
5321 ExpSCEV->eraseFromParent();
5322 }
5324 "all VPExpandSCEVRecipes must have been expanded");
5325 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5326 // to the VPIRBasicBlock.
// Walk the IR block (excluding its terminator) in lockstep with the recipes;
// an instruction already wrapped at the current recipe position is skipped.
5327 auto EI = Entry->begin();
5328 for (Instruction &I : drop_end(*EntryBB)) {
5329 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5330 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5331 EI++;
5332 continue;
5333 }
5335 }
5336
5337 return ExpandedSCEVs;
5338}
5339
5340/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5341/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5342/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5343/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5344/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5345/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5346/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5347/// is defined at \p Idx of a load interleave group.
5348static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5349 VPValue *OpV, unsigned Idx, bool IsScalable) {
5350 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5351 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5352 if (!Member0OpR)
5353 return Member0Op == OpV;
5354 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5355 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5356 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5357 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5358 Member0Op == OpV;
5359 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5360 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5361 return false;
5362}
5363
5364static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5366 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5367 if (!WideMember0)
5368 return false;
5369 for (VPValue *V : Ops) {
5371 return false;
5372 auto *R = cast<VPSingleDefRecipe>(V);
5373 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5374 return false;
5375 }
5376
5377 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5379 for (VPValue *Op : Ops)
5380 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5381
5382 if (canNarrowOps(OpsI, IsScalable))
5383 continue;
5384
5385 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5386 const auto &[OpIdx, OpV] = P;
5387 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5388 }))
5389 return false;
5390 }
5391
5392 return true;
5393}
5394
5395/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5396/// number of members both equal to VF. The interleave group must also access
5397/// the full vector width.
5398static std::optional<ElementCount>
5401 const TargetTransformInfo &TTI) {
5402 if (!InterleaveR || InterleaveR->getMask())
5403 return std::nullopt;
5404
5405 Type *GroupElementTy = nullptr;
5406 if (InterleaveR->getStoredValues().empty()) {
5407 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5408 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5409 return Op->getScalarType() == GroupElementTy;
5410 }))
5411 return std::nullopt;
5412 } else {
5413 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5414 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5415 return Op->getScalarType() == GroupElementTy;
5416 }))
5417 return std::nullopt;
5418 }
5419
5420 auto IG = InterleaveR->getInterleaveGroup();
5421 if (IG->getFactor() != IG->getNumMembers())
5422 return std::nullopt;
5423
5424 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5425 TypeSize Size = TTI.getRegisterBitWidth(
5428 assert(Size.isScalable() == VF.isScalable() &&
5429 "if Size is scalable, VF must be scalable and vice versa");
5430 return Size.getKnownMinValue();
5431 };
5432
5433 for (ElementCount VF : VFs) {
5434 unsigned MinVal = VF.getKnownMinValue();
5435 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5436 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5437 return {VF};
5438 }
5439 return std::nullopt;
5440}
5441
5442/// Returns true if \p VPValue is a narrow VPValue.
5443static bool isAlreadyNarrow(VPValue *VPV) {
5444 if (isa<VPIRValue>(VPV))
5445 return true;
5446 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5447 return RepR && RepR->isSingleScalar();
5448}
5449
5450// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5451// a narrow variant.
5452static VPValue *
5454 auto *R = V->getDefiningRecipe();
5455 if (!R || NarrowedOps.contains(V))
5456 return V;
5457
5458 if (isAlreadyNarrow(V))
5459 return V;
5460
5462 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5463 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5464 WideMember0->setOperand(
5465 Idx,
5466 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5467 return V;
5468 }
5469
5470 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5471 // Narrow interleave group to wide load, as transformed VPlan will only
5472 // process one original iteration.
5473 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5474 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5475 LoadGroup->getMask(), /*Consecutive=*/true,
5476 *LoadGroup, LoadGroup->getDebugLoc());
5477 L->insertBefore(LoadGroup);
5478 NarrowedOps.insert(L);
5479 return L;
5480 }
5481
5482 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5483 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5484 "must be a single scalar load");
5485 NarrowedOps.insert(RepR);
5486 return RepR;
5487 }
5488
5489 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5490 VPValue *PtrOp = WideLoad->getAddr();
5491 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5492 PtrOp = VecPtr->getOperand(0);
5493 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5494 // process one original iteration.
5495 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5496 /*IsUniform*/ true,
5497 /*Mask*/ nullptr, {}, *WideLoad);
5498 N->insertBefore(WideLoad);
5499 NarrowedOps.insert(N);
5500 return N;
5501}
5502
5503std::unique_ptr<VPlan>
5505 const TargetTransformInfo &TTI) {
5506 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5507
5508 if (!VectorLoop)
5509 return nullptr;
5510
5511 // Only handle single-block loops for now.
5512 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5513 return nullptr;
5514
5515 // Skip plans when we may not be able to properly narrow.
5516 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5517 if (!match(&Exiting->back(), m_BranchOnCount()))
5518 return nullptr;
5519
5520 assert(match(&Exiting->back(),
5522 m_Specific(&Plan.getVectorTripCount()))) &&
5523 "unexpected branch-on-count");
5524
5526 std::optional<ElementCount> VFToOptimize;
5527 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5530 continue;
5531
5532 // Bail out on recipes not supported at the moment:
5533 // * phi recipes other than the canonical induction
5534 // * recipes writing to memory except interleave groups
5535 // Only support plans with a canonical induction phi.
5536 if (R.isPhi())
5537 return nullptr;
5538
5539 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5540 if (R.mayWriteToMemory() && !InterleaveR)
5541 return nullptr;
5542
5543 // Bail out if any recipe defines a vector value used outside the
5544 // vector loop region.
5545 if (any_of(R.definedValues(), [&](VPValue *V) {
5546 return any_of(V->users(), [&](VPUser *U) {
5547 auto *UR = cast<VPRecipeBase>(U);
5548 return UR->getParent()->getParent() != VectorLoop;
5549 });
5550 }))
5551 return nullptr;
5552
5553 // All other ops are allowed, but we reject uses that cannot be converted
5554 // when checking all allowed consumers (store interleave groups) below.
5555 if (!InterleaveR)
5556 continue;
5557
5558 // Try to find a single VF, where all interleave groups are consecutive and
5559 // saturate the full vector width. If we already have a candidate VF, check
5560 // if it is applicable for the current InterleaveR, otherwise look for a
5561 // suitable VF across the Plan's VFs.
5563 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5564 : to_vector(Plan.vectorFactors());
5565 std::optional<ElementCount> NarrowedVF =
5566 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
5567 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5568 return nullptr;
5569 VFToOptimize = NarrowedVF;
5570
5571 // Skip read interleave groups.
5572 if (InterleaveR->getStoredValues().empty())
5573 continue;
5574
5575 // Narrow interleave groups, if all operands are already matching narrow
5576 // ops.
5577 auto *Member0 = InterleaveR->getStoredValues()[0];
5578 if (isAlreadyNarrow(Member0) &&
5579 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5580 StoreGroups.push_back(InterleaveR);
5581 continue;
5582 }
5583
5584 // For now, we only support full interleave groups storing load interleave
5585 // groups.
5586 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5587 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5588 if (!DefR)
5589 return false;
5590 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5591 return IR && IR->getInterleaveGroup()->isFull() &&
5592 IR->getVPValue(Op.index()) == Op.value();
5593 })) {
5594 StoreGroups.push_back(InterleaveR);
5595 continue;
5596 }
5597
5598 // Check if all values feeding InterleaveR are matching wide recipes, whose
5599 // operands can be narrowed.
5600 if (!canNarrowOps(InterleaveR->getStoredValues(),
5601 VFToOptimize->isScalable()))
5602 return nullptr;
5603 StoreGroups.push_back(InterleaveR);
5604 }
5605
5606 if (StoreGroups.empty())
5607 return nullptr;
5608
5609 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5610 bool RequiresScalarEpilogue =
5611 MiddleVPBB->getNumSuccessors() == 1 &&
5612 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5613 // Bail out for tail-folding (middle block with a single successor to exit).
5614 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5615 return nullptr;
5616
5617 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5618 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5619 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5620 // TODO: Handle cases where only some interleave groups can be narrowed.
5621 std::unique_ptr<VPlan> NewPlan;
5622 if (size(Plan.vectorFactors()) != 1) {
5623 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5624 Plan.setVF(*VFToOptimize);
5625 NewPlan->removeVF(*VFToOptimize);
5626 }
5627
5628 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5629 SmallPtrSet<VPValue *, 4> NarrowedOps;
5630 // Narrow operation tree rooted at store groups.
5631 for (auto *StoreGroup : StoreGroups) {
5632 VPValue *Res =
5633 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5634 auto *SI =
5635 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5636 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5637 /*Consecutive=*/true, *StoreGroup,
5638 StoreGroup->getDebugLoc());
5639 S->insertBefore(StoreGroup);
5640 StoreGroup->eraseFromParent();
5641 }
5642
5643 // Adjust induction to reflect that the transformed plan only processes one
5644 // original iteration.
5646 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5647 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5648 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5649
5650 VPValue *UF = &Plan.getUF();
5651 VPValue *Step;
5652 if (VFToOptimize->isScalable()) {
5653 VPValue *VScale =
5654 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5655 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5656 {true, false});
5657 Plan.getVF().replaceAllUsesWith(VScale);
5658 } else {
5659 Step = UF;
5660 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5661 }
5662 // Materialize vector trip count with the narrowed step.
5663 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5664 RequiresScalarEpilogue, Step);
5665
5666 CanIVInc->setOperand(1, Step);
5667 Plan.getVFxUF().replaceAllUsesWith(Step);
5668
5669 removeDeadRecipes(Plan);
5670 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5672 "All VPVectorPointerRecipes should have been removed");
5673 return NewPlan;
5674}
5675
5676/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5677/// BranchOnCond recipe.
5679 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5680 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5681 auto *MiddleTerm =
5683 // Only add branch metadata if there is a (conditional) terminator.
5684 if (!MiddleTerm)
5685 return;
5686
5687 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5688 "must have a BranchOnCond");
5689 // Assume that `TripCount % VectorStep ` is equally distributed.
5690 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5691 if (VF.isScalable() && VScaleForTuning.has_value())
5692 VectorStep *= *VScaleForTuning;
5693 assert(VectorStep > 0 && "trip count should not be zero");
5694 MDBuilder MDB(Plan.getContext());
5695 MDNode *BranchWeights =
5696 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5697 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5698}
5699
5701 VFRange &Range) {
5702 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5703 auto *MiddleVPBB = Plan.getMiddleBlock();
5704 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5705
5706 auto IsScalableOne = [](ElementCount VF) -> bool {
5707 return VF == ElementCount::getScalable(1);
5708 };
5709
5710 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5711 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5712 if (!FOR)
5713 continue;
5714
5715 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5716 "Cannot handle loops with uncountable early exits");
5717
5718 // Find the existing splice for this FOR, created in
5719 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5720 // RecurSplice there; only RecurSplice itself still references FOR.
5721 auto *RecurSplice =
5723 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5724
5725 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5726 // penultimate value of the recurrence. Instead we rely on the existing
5727 // extract of the last element from the result of
5728 // VPInstruction::FirstOrderRecurrenceSplice.
5729 // TODO: Consider vscale_range info and UF.
5730 if (any_of(RecurSplice->users(),
5731 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5733 Range))
5734 return;
5735
5736 // This is the second phase of vectorizing first-order recurrences, creating
5737 // extracts for users outside the loop. An overview of the transformation is
5738 // described below. Suppose we have the following loop with some use after
5739 // the loop of the last a[i-1],
5740 //
5741 // for (int i = 0; i < n; ++i) {
5742 // t = a[i - 1];
5743 // b[i] = a[i] - t;
5744 // }
5745 // use t;
5746 //
5747 // There is a first-order recurrence on "a". For this loop, the shorthand
5748 // scalar IR looks like:
5749 //
5750 // scalar.ph:
5751 // s.init = a[-1]
5752 // br scalar.body
5753 //
5754 // scalar.body:
5755 // i = phi [0, scalar.ph], [i+1, scalar.body]
5756 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5757 // s2 = a[i]
5758 // b[i] = s2 - s1
5759 // br cond, scalar.body, exit.block
5760 //
5761 // exit.block:
5762 // use = lcssa.phi [s1, scalar.body]
5763 //
5764 // In this example, s1 is a recurrence because its value depends on the
5765 // previous iteration. In the first phase of vectorization, we created a
5766 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5767 // for users in the scalar preheader and exit block.
5768 //
5769 // vector.ph:
5770 // v_init = vector(..., ..., ..., a[-1])
5771 // br vector.body
5772 //
5773 // vector.body
5774 // i = phi [0, vector.ph], [i+4, vector.body]
5775 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5776 // v2 = a[i, i+1, i+2, i+3]
5777 // v1' = splice(v1(3), v2(0, 1, 2))
5778 // b[i, i+1, i+2, i+3] = v2 - v1'
5779 // br cond, vector.body, middle.block
5780 //
5781 // middle.block:
5782 // vector.recur.extract.for.phi = v2(2)
5783 // vector.recur.extract = v2(3)
5784 // br cond, scalar.ph, exit.block
5785 //
5786 // scalar.ph:
5787 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5788 // [s.init, otherwise]
5789 // br scalar.body
5790 //
5791 // scalar.body:
5792 // i = phi [0, scalar.ph], [i+1, scalar.body]
5793 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5794 // s2 = a[i]
5795 // b[i] = s2 - s1
5796 // br cond, scalar.body, exit.block
5797 //
5798 // exit.block:
5799 // lo = lcssa.phi [s1, scalar.body],
5800 // [vector.recur.extract.for.phi, middle.block]
5801 //
5802 // Update extracts of the splice in the middle block: they extract the
5803 // penultimate element of the recurrence.
5805 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5806 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
5807 continue;
5808
5809 auto *ExtractR = cast<VPInstruction>(&R);
5810 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5811 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
5812 {}, "vector.recur.extract.for.phi");
5813 for (VPUser *ExitU : to_vector(ExtractR->users())) {
5814 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
5815 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
5816 }
5817 }
5818 }
5819}
5820
5821/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5822/// value. Returns the widened IV if found, nullptr otherwise.
5824 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5825 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5826 Instruction::isIntDivRem(BinOp->getOpcode()))
5827 return nullptr;
5828
5829 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5830 VPValue *InvariantCandidate = BinOp->getOperand(1);
5831 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5832 std::swap(WidenIVCandidate, InvariantCandidate);
5833
5834 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5835 return nullptr;
5836
5837 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5838}
5839
5840/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5841/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
5845 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5846 auto *ClonedOp = BinOp->clone();
5847 if (ClonedOp->getOperand(0) == WidenIV) {
5848 ClonedOp->setOperand(0, ScalarIV);
5849 } else {
5850 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5851 ClonedOp->setOperand(1, ScalarIV);
5852 }
5853 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5854 return ClonedOp;
5855}
5856
5859 Loop &L) {
5860 ScalarEvolution &SE = *PSE.getSE();
5861 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5862
5863 // Helper lambda to check if the IV range excludes the sentinel value. Try
5864 // signed first, then unsigned. Return an excluded sentinel if found,
5865 // otherwise return std::nullopt.
5866 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5867 bool UseMax) -> std::optional<APSInt> {
5868 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5869 for (bool Signed : {true, false}) {
5870 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5871 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5872
5873 ConstantRange IVRange =
5874 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5875 if (!IVRange.contains(Sentinel))
5876 return Sentinel;
5877 }
5878 return std::nullopt;
5879 };
5880
5881 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5882 for (VPRecipeBase &Phi :
5883 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5884 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5886 PhiR->getRecurrenceKind()))
5887 continue;
5888
5889 Type *PhiTy = PhiR->getScalarType();
5890 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5891 continue;
5892
5893 // If there's a header mask, the backedge select will not be the find-last
5894 // select.
5895 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5896 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5897 if (HeaderMask &&
5898 !match(BackedgeVal,
5899 m_Select(m_Specific(HeaderMask),
5900 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5901 continue;
5902
5903 // Get the find-last expression from the find-last select of the reduction
5904 // phi. The find-last select should be a select between the phi and the
5905 // find-last expression.
5906 VPValue *Cond, *FindLastExpression;
5907 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5908 m_VPValue(FindLastExpression))) &&
5909 !match(FindLastSelect,
5910 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5911 m_Specific(PhiR))))
5912 continue;
5913
5914 // Check if FindLastExpression is a simple expression of a widened IV. If
5915 // so, we can track the underlying IV instead and sink the expression.
5916 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5917 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5918 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5919 &L);
5920 const SCEV *Step;
5921 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5922 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5924 "IVOfExpressionToSink not being an AddRec must imply "
5925 "FindLastExpression not being an AddRec.");
5926 continue;
5927 }
5928
5929 // Determine direction from SCEV step.
5930 if (!SE.isKnownNonZero(Step))
5931 continue;
5932
5933 // Positive step means we need UMax/SMax to find the last IV value, and
5934 // UMin/SMin otherwise.
5935 bool UseMax = SE.isKnownPositive(Step);
5936 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5937 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5938
5939 // Sinking an expression will disable epilogue vectorization. Only use it,
5940 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5941 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5942 // multiply or divide by large constant, respectively), which also makes
5943 // sinking undesirable.
5944 if (IVOfExpressionToSink) {
5945 const SCEV *FindLastExpressionSCEV =
5946 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
5947 if (match(FindLastExpressionSCEV,
5948 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5949 bool NewUseMax = SE.isKnownPositive(Step);
5950 if (auto NewSentinel =
5951 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5952 // The original expression already has a sentinel, so prefer not
5953 // sinking to keep epilogue vectorization possible.
5954 SentinelVal = *NewSentinel;
5955 UseSigned = NewSentinel->isSigned();
5956 UseMax = NewUseMax;
5957 IVSCEV = FindLastExpressionSCEV;
5958 IVOfExpressionToSink = nullptr;
5959 }
5960 }
5961 }
5962
5963 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5964 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5965 // cannot use min/max.
5966 if (!SentinelVal) {
5967 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5968 if (AR->hasNoSignedWrap())
5969 UseSigned = true;
5970 else if (AR->hasNoUnsignedWrap())
5971 UseSigned = false;
5972 else
5973 continue;
5974 }
5975
5977 BackedgeVal,
5979
5980 VPValue *NewFindLastSelect = BackedgeVal;
5981 VPValue *SelectCond = Cond;
5982 if (!SentinelVal || IVOfExpressionToSink) {
5983 // When we need to create a new select, normalize the condition so that
5984 // PhiR is the last operand and include the header mask if needed.
5985 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5986 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
5987 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5988 SelectCond = LoopBuilder.createNot(SelectCond);
5989
5990 // When tail folding, mask the condition with the header mask to prevent
5991 // propagating poison from inactive lanes in the last vector iteration.
5992 if (HeaderMask)
5993 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5994
5995 if (SelectCond != Cond || IVOfExpressionToSink) {
5996 NewFindLastSelect = LoopBuilder.createSelect(
5997 SelectCond,
5998 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5999 PhiR, DL);
6000 }
6001 }
6002
6003 // Create the reduction result in the middle block using sentinel directly.
6004 RecurKind MinMaxKind =
6005 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6006 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6007 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6008 FastMathFlags());
6009 DebugLoc ExitDL = RdxResult->getDebugLoc();
6010 VPBuilder MiddleBuilder(RdxResult);
6011 VPValue *ReducedIV =
6013 NewFindLastSelect, Flags, ExitDL);
6014
6015 // If IVOfExpressionToSink is an expression to sink, sink it now.
6016 VPValue *VectorRegionExitingVal = ReducedIV;
6017 if (IVOfExpressionToSink)
6018 VectorRegionExitingVal =
6019 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6020 ReducedIV, IVOfExpressionToSink);
6021
6022 VPValue *NewRdxResult;
6023 VPValue *StartVPV = PhiR->getStartValue();
6024 if (SentinelVal) {
6025 // Sentinel-based approach: reduce IVs with min/max, compare against
6026 // sentinel to detect if condition was ever true, select accordingly.
6027 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6028 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6029 Sentinel, ExitDL);
6030 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6031 StartVPV, ExitDL);
6032 StartVPV = Sentinel;
6033 } else {
6034 // Introduce a boolean AnyOf reduction to track if the condition was ever
6035 // true in the loop. Use it to select the initial start value, if it was
6036 // never true.
6037 auto *AnyOfPhi = new VPReductionPHIRecipe(
6038 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6039 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6040 AnyOfPhi->insertAfter(PhiR);
6041
6042 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6043 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6044 AnyOfPhi->setOperand(1, OrVal);
6045
6046 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6047 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6048
6049 // Initialize the IV reduction phi with the neutral element, not the
6050 // original start value, to ensure correct min/max reduction results.
6051 StartVPV = Plan.getOrAddLiveIn(
6052 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6053 }
6054 RdxResult->replaceAllUsesWith(NewRdxResult);
6055 RdxResult->eraseFromParent();
6056
6057 auto *NewPhiR = new VPReductionPHIRecipe(
6058 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6059 *NewFindLastSelect, RdxUnordered{1}, {},
6060 PhiR->hasUsesOutsideReductionChain());
6061 NewPhiR->insertBefore(PhiR);
6062 PhiR->replaceAllUsesWith(NewPhiR);
6063 PhiR->eraseFromParent();
6064 }
6065}
6066
6067namespace {
6068
6069using ExtendKind = TTI::PartialReductionExtendKind;
6070struct ReductionExtend {
6071 Type *SrcType = nullptr;
6072 ExtendKind Kind = ExtendKind::PR_None;
6073};
6074
6075/// Describes the extends used to compute the extended reduction operand.
6076/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6077/// operation.
6078struct ExtendedReductionOperand {
6079 /// The recipe that consumes the extends.
6080 VPWidenRecipe *ExtendsUser = nullptr;
6081 /// Extend descriptions (inputs to getPartialReductionCost).
6082 ReductionExtend ExtendA, ExtendB;
6083};
6084
6085/// A chain of recipes that form a partial reduction. Matches either
6086/// reduction_bin_op (extended op, accumulator), or
6087/// reduction_bin_op (accumulator, extended op).
6088/// The possible forms of the "extended op" are listed in
6089/// matchExtendedReductionOperand.
6090struct VPPartialReductionChain {
6091 /// The top-level binary operation that forms the reduction to a scalar
6092 /// after the loop body.
6093 VPWidenRecipe *ReductionBinOp = nullptr;
6094 /// The user of the extends that is then reduced.
6095 ExtendedReductionOperand ExtendedOp;
6096 /// The recurrence kind for the entire partial reduction chain.
6097 /// This allows distinguishing between Sub and AddWithSub recurrences,
6098 /// when the ReductionBinOp is a Instruction::Sub.
6099 RecurKind RK;
6100 /// The index of the accumulator operand of ReductionBinOp. The extended op
6101 /// is `1 - AccumulatorOpIdx`.
6102 unsigned AccumulatorOpIdx;
6103 unsigned ScaleFactor;
6104};
6105
6106static VPSingleDefRecipe *
6107optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6108 // reduce.add(mul(ext(A), C))
6109 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6110 const APInt *Const;
6111 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6112 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6113 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6114 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
6115 if (!Op->hasOneUse() ||
6117 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6118 return Op;
6119
6120 VPBuilder Builder(Op);
6121 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6122 Op->getOperand(1), NarrowTy);
6123 Type *WideTy = ExtA->getScalarType();
6124 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6125 return Op;
6126 }
6127
6128 // reduce.add(abs(sub(ext(A), ext(B))))
6129 // -> reduce.add(ext(absolute-difference(A, B)))
6130 VPValue *X, *Y;
6133 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6134 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6135 assert(Ext->getOpcode() ==
6136 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6137 "Expected both the LHS and RHS extends to be the same");
6138 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6139 VPBuilder Builder(Op);
6140 Type *SrcTy = X->getScalarType();
6141 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6142 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6143 auto *Max = Builder.insert(
6144 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6145 {FreezeX, FreezeY}, SrcTy));
6146 auto *Min = Builder.insert(
6147 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6148 {FreezeX, FreezeY}, SrcTy));
6149 auto *AbsDiff =
6150 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6151 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6152 Op->getScalarType());
6153 }
6154
6155 // reduce.add(ext(mul(ext(A), ext(B))))
6156 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6157 // TODO: Support this optimization for float types.
6159 m_ZExtOrSExt(m_VPValue()))))) {
6160 auto *Ext = cast<VPWidenCastRecipe>(Op);
6161 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6162 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6163 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6164 if (!Mul->hasOneUse() ||
6165 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6166 MulLHS->getOpcode() != MulRHS->getOpcode())
6167 return Op;
6168 VPBuilder Builder(Mul);
6169 auto *NewLHS = Builder.createWidenCast(
6170 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
6171 auto *NewRHS = MulLHS == MulRHS
6172 ? NewLHS
6173 : Builder.createWidenCast(MulRHS->getOpcode(),
6174 MulRHS->getOperand(0),
6175 Ext->getScalarType());
6176 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6177 Builder.insert(NewMul);
6178 Op->replaceAllUsesWith(NewMul);
6179 Op->eraseFromParent();
6180 Mul->eraseFromParent();
6181 return NewMul;
6182 }
6183
6184 return Op;
6185}
6186
/// Builds the VPExpressionRecipe that bundles the partial reduction \p Red
/// with the recipes producing its vector operand. The vector operand is
/// matched against the extend / negate / multiply shapes below, tried in
/// order; an operand matching none of them reaches llvm_unreachable.
///
/// \returns a newly allocated VPExpressionRecipe. Nothing is inserted into a
/// block by this function.
///
/// NOTE(review): listing lines 6209, 6219 and 6229 are absent, so the second
/// half of the [f]mul condition and the conditions guarding the last two
/// patterns are not visible here.
6187static VPExpressionRecipe *
6188createPartialReductionExpression(VPReductionRecipe *Red) {
6189 VPValue *VecOp = Red->getVecOp();
6190
6191 // reduce.[f]add(ext(op))
6192 // -> VPExpressionRecipe(op, red)
6193 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6194 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6195
6196 // reduce.[f]add(neg(ext(op)))
6197 // -> VPExpressionRecipe(op, sub/neg, red)
6198 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6199 auto *Neg = cast<VPWidenRecipe>(VecOp);
// The extend is read from the last operand of the negate (presumably operand
// 1 of sub(0, x) and operand 0 of fneg(x) - TODO confirm against m_AnyNeg).
6200 auto *Ext =
6201 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6202 return new VPExpressionRecipe(Ext, Neg, Red);
6203 }
6204
6205 // reduce.[f]add([f]mul(ext(a), ext(b)))
6206 // -> VPExpressionRecipe(a, b, mul, red)
6207 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6208 match(VecOp,
6210 auto *Mul = cast<VPWidenRecipe>(VecOp);
6211 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6212 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6213 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6214 }
6215
6216 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6217 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6218 if (match(VecOp,
6220 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6221 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6222 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6223 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6224 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6225 }
6226
6227 // reduce.add(neg(mul(ext(a), ext(b))))
6228 // -> VPExpressionRecipe(a, b, mul, sub, red)
6230 m_ZExtOrSExt(m_VPValue()))))) {
6231 auto *Sub = cast<VPWidenRecipe>(VecOp);
// The multiply is operand 1 of the subtract, i.e. the subtrahend.
6232 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6233 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6234 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6235 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6236 }
6237
6238 llvm_unreachable("Unsupported expression");
6239}
6240
6241// Helper to transform a partial reduction chain into a partial reduction
6242// recipe. Assumes profitability has been checked.
//
// Steps, in order:
//  1. If the update is a [f]sub whose recurrence kind is not [F]Sub, negate
//     the extended operand with a new recipe placed before the update.
//  2. Run optimizeExtendsForPartialReduction on the extended operand.
//  3. Create a VPReductionRecipe before the update and redirect the update's
//     users (and, for predicated reductions, the select's users) to it.
//  4. Wrap that reduction in a VPExpressionRecipe and redirect users again.
//  5. Only for the last link of the chain: set the PHI's VF scale factor and
//     operand 2 of its ReductionStartVector; for [F]Sub recurrences also
//     replace start operand 0 with operand 1 and subtract the reduction result
//     from the old start value.
//
// NOTE(review): listing lines 6277, 6282, 6294, 6304 and 6346 are absent, so
// the trailing arguments of both negate recipes, the declaration of ExitValue,
// the initializer of RdxKind and the declaration of RdxResult are not visible.
6243static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6244 VPlan &Plan,
6245 VPReductionPHIRecipe *RdxPhi) {
6246 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6247 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6248
// The operand at AccumulatorOpIdx is the accumulator; the other operand is
// the extended value being reduced.
6249 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6250 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6251 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6252
6253 // Sub-reductions can be implemented in two ways:
6254 // (1) negate the operand in the vector loop (the default way).
6255 // (2) subtract the reduced value from the init value in the middle block.
6256 // Both ways keep the reduction itself as an 'add' reduction.
6257 //
6258 // The ISD nodes for partial reductions don't support folding the
6259 // sub/negation into its operands because the following is not a valid
6260 // transformation:
6261 // sub(0, mul(ext(a), ext(b)))
6262 // -> mul(ext(a), ext(sub(0, b)))
6263 //
6264 // It's therefore better to choose option (2) such that the partial
6265 // reduction is always positive (starting at '0') and to do a final
6266 // subtract in the middle block.
6267 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6268 Chain.RK != RecurKind::Sub) ||
6269 (WidenRecipe->getOpcode() == Instruction::FSub &&
6270 Chain.RK != RecurKind::FSub)) {
6271 VPBuilder Builder(WidenRecipe);
6272 Type *ElemTy = ExtendedOp->getScalarType();
6273 VPWidenRecipe *NegRecipe;
6274 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6275 NegRecipe =
6276 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6278 } else {
6279 auto *Zero = Plan.getZero(ElemTy);
6280 NegRecipe =
6281 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6283 }
6284 Builder.insert(NegRecipe);
6285 ExtendedOp = NegRecipe;
6286 }
6287
6288 // FIXME: Do these transforms before invoking the cost-model.
6289 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6290
6291 // Check if WidenRecipe is the final result of the reduction. If so look
6292 // through selects for predicated reductions.
6293 VPValue *Cond = nullptr;
6295 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6296 m_Specific(RdxPhi))));
6297 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6298 RdxPhi->getBackedgeValue() == ExitValue;
6299 assert((!ExitValue || IsLastInChain) &&
6300 "if we found ExitValue, it must match RdxPhi's backedge value");
6301
6302 Type *PhiType = RdxPhi->getScalarType();
6303 RecurKind RdxKind =
// Fast-math flags are forwarded from the update recipe only for FAdd
// reductions; any other kind gets default-constructed flags.
6305 auto *PartialRed = new VPReductionRecipe(
6306 RdxKind,
6307 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6308 : FastMathFlags(),
6309 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6310 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6311 PartialRed->insertBefore(WidenRecipe);
6312
// Cond is only bound when the select pattern above matched, so it doubles as
// the "a select was found" flag here.
6313 if (Cond)
6314 ExitValue->replaceAllUsesWith(PartialRed);
6315 WidenRecipe->replaceAllUsesWith(PartialRed);
6316
6317 // For cost-model purposes, fold this into a VPExpression.
6318 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6319 E->insertBefore(WidenRecipe);
6320 PartialRed->replaceAllUsesWith(E);
6321
6322 // We only need to update the PHI node once, which is when we find the
6323 // last reduction in the chain.
6324 if (!IsLastInChain)
6325 return;
6326
6327 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6328 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6329 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6330
6331 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6332 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
// Operand 2 of ReductionStartVector carries the scale factor as a 32-bit
// constant.
6333 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6334 StartInst->setOperand(2, NewScaleFactor);
6335
6336 // If this is the last value in a sub-reduction chain, then update the PHI
6337 // node to start at `0` and update the reduction-result to subtract from
6338 // the PHI's start value.
6339 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6340 return;
6341
// Operand 1 replaces the start value in operand 0 (presumably operand 1 is
// the identity value - TODO confirm); the old start value is re-applied below.
6342 VPValue *OldStartValue = StartInst->getOperand(0);
6343 StartInst->setOperand(0, StartInst->getOperand(1));
6344
6345 // Replace reduction_result by 'sub (startval, reductionresult)'.
6347 assert(RdxResult && "Could not find reduction result");
6348
6349 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6350 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6351 : Instruction::BinaryOps::Sub;
6352 VPInstruction *NewResult = Builder.createNaryOp(
6353 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6354 RdxPhi->getDebugLoc());
// Rewire every user of RdxResult except NewResult itself, which must keep
// RdxResult as its operand.
6355 RdxResult->replaceUsesWithIf(
6356 NewResult,
6357 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6358}
6359
6360/// Returns the cost of a link in a partial-reduction chain for a given VF.
6361static InstructionCost
6362getPartialReductionLinkCost(VPCostContext &CostCtx,
6363 const VPPartialReductionChain &Link,
6364 ElementCount VF) {
6365 Type *RdxType = Link.ReductionBinOp->getScalarType();
6366 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6367 std::optional<unsigned> BinOpc = std::nullopt;
6368 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6369 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6370 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6371
6372 std::optional<llvm::FastMathFlags> Flags;
6373 if (RdxType->isFloatingPointTy())
6374 Flags = Link.ReductionBinOp->getFastMathFlags();
6375
6376 auto GetLinkOpcode = [&Link]() -> unsigned {
6377 switch (Link.RK) {
6378 case RecurKind::Sub:
6379 return Instruction::Add;
6380 case RecurKind::FSub:
6381 return Instruction::FAdd;
6382 default:
6383 return Link.ReductionBinOp->getOpcode();
6384 }
6385 };
6386
6387 return CostCtx.TTI.getPartialReductionCost(
6388 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6389 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6390 CostCtx.CostKind, Flags);
6391}
6392
/// Maps the widening cast recipe \p Cast to an ExtendKind.
/// NOTE(review): the body (listing line 6394) is absent from this listing, so
/// the mapping itself cannot be described from here.
6393static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6395}
6396
6397/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6398/// operand. This is an operand where the source of the value (e.g. a load) has
6399/// been extended (sext, zext, or fpext) before it is used in the reduction.
6400///
6401/// Possible forms matched by this function:
6402/// - UpdateR(PrevValue, ext(...))
6403/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6404/// - UpdateR(PrevValue, mul(ext(...), Constant))
6405/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6406/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6407/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6408///
6409/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
///
/// \returns a description of the match (the recipe that uses the extends plus
/// the source type and extend kind of each extend), or std::nullopt if \p Op
/// has none of the forms above.
///
/// NOTE(review): listing lines 6422-6424, 6443, 6469, 6481 and 6491 are
/// absent, so several match conditions and the declaration of MulOp are not
/// visible here.
6410static std::optional<ExtendedReductionOperand>
6411matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6412 assert(is_contained(UpdateR->operands(), Op) &&
6413 "Op should be operand of UpdateR");
6414
6415 // Try matching an absolute difference operand of the form
6416 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6417 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6418 // difference on a wider type and get the extend for "free" from the partial
6419 // reduction.
6420 VPValue *X, *Y;
6421 if (Op->hasOneUse() &&
6425 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6426 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6427 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6428 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6429 Type *LHSInputType = X->getScalarType();
6430 Type *RHSInputType = Y->getScalarType();
// Both extends must have the same source type and the same cast opcode.
6431 if (LHSInputType != RHSInputType ||
6432 LHSExt->getOpcode() != RHSExt->getOpcode())
6433 return std::nullopt;
6434 // Note: This is essentially the same as matching ext(...) as we will
6435 // rewrite this operand to ext(absolute-difference(A, B)).
6436 return ExtendedReductionOperand{
6437 Sub,
6438 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6439 /*ExtendB=*/{}};
6440 }
6441
6442 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6444 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6445 VPValue *CastSource = CastRecipe->getOperand(0);
6446 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6447 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6448 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6449 // Match: ext(mul(...))
6450 // Record the outer extend kind and set `Op` to the mul. We can then match
6451 // this as a binary operation. Note: We can optimize out the outer extend
6452 // by widening the inner extends to match it. See
6453 // optimizeExtendsForPartialReduction.
6454 Op = CastSource;
6455 // FIXME: createPartialReductionExpression can't handle sub(ext(mul(...)))
6456 if (UpdateR->getOpcode() == Instruction::Sub)
6457 return std::nullopt;
6458 } else {
// Plain ext(...): the update recipe itself is recorded as the user of the
// extend and there is no second extend.
6459 return ExtendedReductionOperand{
6460 UpdateR,
6461 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6462 /*ExtendB=*/{}};
6463 }
6464 }
6465
6466 if (!Op->hasOneUse())
6467 return std::nullopt;
6468
6470 if (!MulOp ||
6471 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6472 return std::nullopt;
6473
6474 // The rest of the matching assumes `Op` is a (possibly extended) mul
6475 // operation.
6476
6477 VPValue *LHS = MulOp->getOperand(0);
6478 VPValue *RHS = MulOp->getOperand(1);
6479
6480 // The LHS of the operation must always be an extend.
6482 return std::nullopt;
6483
6484 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6485 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6486 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6487
6488 // The RHS of the operation can be an extend or a constant integer.
6489 const APInt *RHSConst = nullptr;
6490 VPWidenCastRecipe *RHSCast = nullptr;
6492 RHSCast = cast<VPWidenCastRecipe>(RHS);
6493 else if (!match(RHS, m_APInt(RHSConst)) ||
6494 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6495 return std::nullopt;
6496
6497 // The outer extend kind must match the inner extends for folding.
// RHSCast is null for a constant RHS, hence the null check inside the loop.
6498 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6499 if (Cast && OuterExtKind &&
6500 getPartialReductionExtendKind(Cast) != OuterExtKind)
6501 return std::nullopt;
6502
// A constant RHS is described as if it were extended exactly like the LHS;
// a real RHS extend overrides these defaults.
6503 Type *RHSInputType = LHSInputType;
6504 ExtendKind RHSExtendKind = LHSExtendKind;
6505 if (RHSCast) {
6506 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6507 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6508 }
6509
6510 return ExtendedReductionOperand{
6511 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6512}
6513
6514/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6515/// and determines if the target can use a cheaper operation with a wider
6516/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6517/// of operations in the reduction.
///
/// \returns the chain links in program order, or std::nullopt when no
/// reduction result is found, a link is not a binary VPWidenRecipe, neither
/// operand of a link is an extended reduction operand, or the PHI size is not
/// a known scalar multiple of the extend source size.
///
/// NOTE(review): listing line 6530 is absent; it presumably declares `Chain`.
/// \p CostCtx and \p Range are not referenced on any visible line.
6518static std::optional<SmallVector<VPPartialReductionChain>>
6519getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6520 VFRange &Range) {
6521 // Get the backedge value from the reduction PHI and find the
6522 // ComputeReductionResult that uses it (directly or through a select for
6523 // predicated reductions).
6524 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6525 if (!RdxResult)
6526 return std::nullopt;
6527 VPValue *ExitValue = RdxResult->getOperand(0);
// Look through a select: on a match ExitValue is rebound to the select's
// second operand. The match result is not checked (presumably ExitValue is
// left unchanged when there is no select - TODO confirm).
6528 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6529
6531 RecurKind RK = RedPhiR->getRecurrenceKind();
6532 Type *PhiType = RedPhiR->getScalarType();
6533 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6534
6535 // Work backwards from the ExitValue examining each reduction operation.
6536 VPValue *CurrentValue = ExitValue;
6537 while (CurrentValue != RedPhiR) {
6538 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6539 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6540 return std::nullopt;
6541
6542 VPValue *Op = UpdateR->getOperand(1);
6543 VPValue *PrevValue = UpdateR->getOperand(0);
6544
6545 // Find the extended operand. The other operand (PrevValue) is the next link
6546 // in the reduction chain.
// Operand 1 is tried first; if only operand 0 matches, the roles are swapped.
6547 std::optional<ExtendedReductionOperand> ExtendedOp =
6548 matchExtendedReductionOperand(UpdateR, Op);
6549 if (!ExtendedOp) {
6550 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6551 if (!ExtendedOp)
6552 return std::nullopt;
6553 std::swap(Op, PrevValue);
6554 }
6555
6556 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6557 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6558 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6559 return std::nullopt;
6560
6561 // Check if a partial reduction chain is supported by the target (i.e. does
6562 // not have an invalid cost) for the given VF range. Clamps the range and
6563 // returns true if feasible for any VF.
// NOTE(review): no cost query or range clamping is visible below; the
// comment above may be stale - confirm against the full source.
// The link records the accumulator operand index and, as scale factor, the
// ratio of the PHI size to the extend source size.
6564 VPPartialReductionChain Link(
6565 {UpdateR, *ExtendedOp, RK,
6566 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6567 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6568 Chain.push_back(Link);
6569 CurrentValue = PrevValue;
6570 }
6571
6572 // The chain links were collected by traversing backwards from the exit value.
6573 // Reverse the chains so they are in program order.
6574 std::reverse(Chain.begin(), Chain.end());
6575 return Chain;
6576}
6577} // namespace
6578
// NOTE(review): the first signature line of this definition (listing line
// 6579) is absent, as are lines 6585, 6660 and 6684; the function name, the
// type of ChainsByPhi, one alternative of UseIsValid and the call that wraps
// the profitability lambda are therefore not visible here.
//
// Visible flow: collect a chain per reduction PHI with getScaledReductions,
// clear a PHI's chains when validation or profitability fails, then call
// transformToPartialReduction on every link of the surviving chains.
6580 VPCostContext &CostCtx,
6581 VFRange &Range) {
6582 // Find all possible valid partial reductions, grouping chains by their PHI.
6583 // This grouping allows invalidating the whole chain, if any link is not a
6584 // valid partial reduction.
6586 ChainsByPhi;
6587 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6588 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6589 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6590 if (!RedPhiR)
6591 continue;
6592
6593 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6594 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6595 }
6596
6597 if (ChainsByPhi.empty())
6598 return;
6599
6600 // Build set of partial reduction operations for extend user validation and
6601 // a map of reduction bin ops to their scale factors for scale validation.
6602 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6603 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6604 for (const auto &[_, Chains] : ChainsByPhi)
6605 for (const VPPartialReductionChain &Chain : Chains) {
6606 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6607 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6608 }
6609
6610 // A partial reduction is invalid if any of its extends are used by
6611 // something that isn't another partial reduction. This is because the
6612 // extends are intended to be lowered along with the reduction itself.
// Values that are not widening casts are accepted unconditionally.
6613 auto ExtendUsersValid = [&](VPValue *Ext) {
6614 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6615 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6616 });
6617 };
6618
6619 auto IsProfitablePartialReductionChainForVF =
6620 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6621 InstructionCost PartialCost = 0, RegularCost = 0;
6622
6623 // The chain is a profitable partial reduction chain if the cost of handling
6624 // the entire chain is cheaper when using partial reductions than when
6625 // handling the entire chain using regular reductions.
6626 for (const VPPartialReductionChain &Link : Chain) {
6627 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6628 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
// A single link with an invalid cost rejects the whole chain.
6629 if (!LinkCost.isValid())
6630 return false;
6631
6632 PartialCost += LinkCost;
6633 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6634 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6635 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6636 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6637 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6638 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6639 RegularCost += Extend->computeCost(VF, CostCtx);
6640 }
// Strictly cheaper is required: equal cost keeps the regular reduction.
6641 return PartialCost.isValid() && PartialCost < RegularCost;
6642 };
6643
6644 // Validate chains: check that extends are only used by partial reductions,
6645 // and that reduction bin ops are only used by other partial reductions with
6646 // matching scale factors, are outside the loop region or the select
6647 // introduced by tail-folding. Otherwise we would create users of scaled
6648 // reductions where the types of the other operands don't match.
6649 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6650 for (const VPPartialReductionChain &Chain : Chains) {
6651 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6652 Chains.clear();
6653 break;
6654 }
6655 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6656 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6657 return PhiR == RedPhiR;
6658 auto *R = cast<VPSingleDefRecipe>(U);
// Users absent from the map look up as 0 and so never match a scale factor.
6659 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6661 m_Specific(Chain.ReductionBinOp))) ||
6662 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6663 m_Specific(RedPhiR)));
6664 };
6665 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6666 Chains.clear();
6667 break;
6668 }
6669
6670 // Check if the compute-reduction-result is used by a sunk store.
6671 // TODO: Also form partial reductions in those cases.
6672 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6673 if (any_of(RdxResult->users(), [](VPUser *U) {
6674 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6675 return RepR && RepR->getOpcode() == Instruction::Store;
6676 })) {
6677 Chains.clear();
6678 break;
6679 }
6680 }
6681 }
6682
6683 // Clear the chain if it is not profitable.
6685 [&, &Chains = Chains](ElementCount VF) {
6686 return IsProfitablePartialReductionChainForVF(Chains, VF);
6687 },
6688 Range))
6689 Chains.clear();
6690 }
6691
// Cleared chains are empty, so only the surviving chains are transformed.
6692 for (auto &[Phi, Chains] : ChainsByPhi)
6693 for (const VPPartialReductionChain &Chain : Chains)
6694 transformToPartialReduction(Chain, Plan, Phi);
6695}
6696
// NOTE(review): the first signature line of this definition (listing line
// 6697) is absent, as are lines 6702, 6704-6705 and 6732; the function name,
// the declaration of MemOps, the block range being iterated and the call that
// wraps the scalar-VF predicate are therefore not visible here.
//
// Visible flow: gather every VPInstruction load/store that has an underlying
// value, then replace each one with a final reduction store, a histogram
// recipe, a widened memory recipe, or a replicate recipe.
6698 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6699 // Collect all loads/stores first. We will start with ones having simpler
6700 // decisions followed by more complex ones that are potentially
6701 // guided/dependent on the simpler ones.
6703 for (VPBasicBlock *VPBB :
6706 for (VPRecipeBase &R : *VPBB) {
6707 auto *VPI = dyn_cast<VPInstruction>(&R);
6708 if (VPI && VPI->getUnderlyingValue() &&
6709 is_contained({Instruction::Load, Instruction::Store},
6710 VPI->getOpcode()))
6711 MemOps.push_back(VPI);
6712 }
6713 }
6714
6715 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6716 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6717
6718 for (VPInstruction *VPI : MemOps) {
// Inserts New before VPI and erases VPI. Uses are only forwarded for loads;
// a store's uses are not touched.
6719 auto ReplaceWith = [&](VPRecipeBase *New) {
6720 New->insertBefore(VPI);
6721 if (VPI->getOpcode() == Instruction::Load)
6722 VPI->replaceAllUsesWith(New->getVPSingleValue());
6723 VPI->eraseFromParent();
6724 };
6725
6726 // Note: we must do that for scalar VPlan as well.
6727 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6728 FinalRedStoresBuilder))
6729 continue;
6730
6731 // Filter out scalar VPlan for the remaining memory operations.
6733 [](ElementCount VF) { return VF.isScalar(); }, Range))
6734 continue;
6735
6736 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6737 ReplaceWith(Histogram);
6738 continue;
6739 }
6740
// Replication is the fallback when tryToWidenMemory returns no recipe.
6741 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6742 if (!Recipe)
6743 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6744
6745 ReplaceWith(Recipe);
6746 }
6747}
6748
// NOTE(review): the signature and the start of the first statement of this
// definition (listing lines 6749-6750) are absent, as are lines 6754, 6756,
// 6778 and 6782; the function name, the block traversal being iterated, the
// second half of the IV-increment check and the "other lanes" condition are
// therefore not visible here.
//
// Visible flow: walk each block's recipes in reverse and replace a
// VPInstruction that passes every filter below with a single-scalar,
// unmasked VPReplicateRecipe.
6751 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6752 return;
6753
6755 Plan.getEntry());
6757 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6758 auto *VPI = dyn_cast<VPInstruction>(&R);
6759 if (!VPI)
6760 continue;
6761
6762 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6763 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6764 if (!I)
6765 continue;
6766
6767 // If executing other lanes produces side-effects we can't avoid them.
6768 if (VPI->mayHaveSideEffects())
6769 continue;
6770
6771 // We want to drop the mask operand, verify we can safely do that.
6772 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6773 continue;
6774
6775 // Avoid rewriting IV increment as that interferes with
6776 // `removeRedundantCanonicalIVs`.
6777 if (VPI->getOpcode() == Instruction::Add &&
6779 continue;
6780
6781 // Other lanes are needed - can't drop them.
6783 continue;
6784
// The replacement takes the operands without the mask and a null mask; *VPI
// is passed twice (presumably as the flags and metadata source - TODO confirm).
6785 auto *Recipe = new VPReplicateRecipe(
6786 I, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
6787 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
6788 Recipe->insertBefore(VPI);
6789 VPI->replaceAllUsesWith(Recipe);
6790 VPI->eraseFromParent();
6791 }
6792 }
6793}
6794
6795/// Returns true if \p Info's parameter kinds are compatible with \p Args.
6796static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
6797 PredicatedScalarEvolution &PSE, const Loop *L) {
6798 ScalarEvolution *SE = PSE.getSE();
6799 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
6800 switch (Param.ParamKind) {
6801 case VFParamKind::Vector:
6802 case VFParamKind::GlobalPredicate:
6803 return true;
6804 case VFParamKind::OMP_Uniform:
6805 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
6806 SE->isLoopInvariant(
6807 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6808 L);
6809 case VFParamKind::OMP_Linear:
6810 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
6811 m_scev_AffineAddRec(
6812 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
6813 m_SpecificLoop(L)));
6814 default:
6815 return false;
6816 }
6817 });
6818}
6819
6820/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
6821/// Returns the variant function, or nullptr. Masked variants are assumed to
6822/// take the mask as a trailing parameter.
///
/// nullptr is returned when \p CI is marked no-builtin or when no mapping has
/// the requested VF, satisfies the mask requirement and passes areVFParamsOk.
/// The result is looked up by name in the module of \p CI, so it is
/// presumably also nullptr when that function is not declared there - TODO
/// confirm.
///
/// NOTE(review): listing lines 6823 and 6825 are absent, so the first
/// signature line (return type, name, leading parameters) and one further
/// parameter line are not visible here.
6824 ElementCount VF, bool MaskRequired,
6826 const Loop *L) {
6827 if (CI->isNoBuiltin())
6828 return nullptr;
6829 auto Mappings = VFDatabase::getMappings(*CI);
// The first mapping that satisfies all three conditions wins. A masked
// mapping is accepted even when no mask is required.
6830 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
6831 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
6832 areVFParamsOk(Info, Args, PSE, L);
6833 });
6834 if (It == Mappings.end())
6835 return nullptr;
6836 return CI->getModule()->getFunction(It->VectorName);
6837}
6838
6839namespace {
6840/// The outcome of choosing how to widen a call at a given VF.
/// NOTE(review): listing line 6848 is absent; it presumably declares the
/// `Variant` member initialized by the constructor below.
6841struct CallWideningDecision {
6842 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
// Not explicit, so a bare KindTy converts to a decision with a null Variant;
// decideCallWidening relies on this when it returns a KindTy directly.
6843 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
6844 : Kind(Kind), Variant(Variant) {}
6845 KindTy Kind;
6846
6847 /// Set when Kind == VectorVariant.
6849
// Two decisions are equal only if both the kind and the variant match.
6850 bool operator==(const CallWideningDecision &Other) const {
6851 return Kind == Other.Kind && Variant == Other.Variant;
6852 }
6853};
6854} // namespace
6855
6856/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
6857/// vector intrinsic, and vector library variant.
///
/// Ties are resolved by the `>=` comparisons below: the intrinsic is chosen
/// when it costs no more than scalarizing and no more than any vector variant,
/// and a vector variant is chosen when it costs no more than scalarizing.
///
/// NOTE(review): listing lines 6859, 6869, 6871, 6875, 6884 and 6891-6892 are
/// absent, so the `Ops` parameter, the operand cast to `CalledFn`, the
/// definition of `ID`, the pseudo-intrinsic condition and the declarations of
/// `VecCallCost` and `IntrinsicCost` are not visible here.
6858static CallWideningDecision decideCallWidening(VPInstruction &VPI,
6860 ElementCount VF,
6861 VPCostContext &CostCtx) {
6862 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
6863
6864 // Scalar VFs and calls forced or known to scalarize always replicate.
6865 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
6866 return CallWideningDecision::KindTy::Scalarize;
6867
6868 auto *CalledFn = cast<Function>(
6870 Type *ResultTy = VPI.getScalarType();
6872 bool MaskRequired = CostCtx.isMaskRequired(CI);
6873
6874 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
6876 return CallWideningDecision::KindTy::Scalarize;
6877
6878 InstructionCost ScalarCost =
6879 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
6880 /*IsSingleScalar=*/false, VF, CostCtx);
6881
6882 Function *VecFunc =
6883 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
// The variant's cost is only computed when a variant exists.
6885 if (VecFunc)
6886 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
6887
6888 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
6889 // available vector variant.
6890 if (ID) {
6893 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
6894 (!VecFunc || VecCallCost >= IntrinsicCost))
6895 return CallWideningDecision::KindTy::Intrinsic;
6896 }
6897
6898 // Otherwise, use a vector library variant when it beats scalarizing.
6899 if (VecFunc && ScalarCost >= VecCallCost)
6900 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
6901
6902 return CallWideningDecision::KindTy::Scalarize;
6903}
6904
// NOTE(review): the first signature line of this definition (listing line
// 6905) is absent, as are lines 6908-6909, 6922 and 6931; the function name,
// the block traversal being iterated, the call that wraps the range-clamping
// predicate and the definition of `ID` are therefore not visible here.
//
// Visible flow: for each VPInstruction call with an underlying value, decide
// at Range.Start how to widen it, then replace it with an intrinsic recipe, a
// vector-variant call recipe, or a replicate recipe.
6906 VPRecipeBuilder &RecipeBuilder,
6907 VPCostContext &CostCtx) {
6910 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
6911 auto *VPI = dyn_cast<VPInstruction>(&R);
6912 if (!VPI || !VPI->getUnderlyingValue() ||
6913 VPI->getOpcode() != Instruction::Call)
6914 continue;
6915
6916 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
// Only the first arg_size() operands are taken, i.e. the call's arguments.
6917 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
6918 VPI->op_begin() + CI->arg_size());
6919
6920 CallWideningDecision Decision =
6921 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
6923 [&](ElementCount VF) {
6924 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
6925 },
6926 Range);
6927
6928 VPSingleDefRecipe *Replacement = nullptr;
6929 switch (Decision.Kind) {
6930 case CallWideningDecision::KindTy::Intrinsic: {
6932 Type *ResultTy = VPI->getScalarType();
6933 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
6934 *VPI, VPI->getDebugLoc());
6935 break;
6936 }
6937 case CallWideningDecision::KindTy::VectorVariant: {
6938 // Masked variants take the mask as a trailing parameter, so they have
6939 // one more parameter than the original call's arguments.
// An unmasked call to a masked variant gets an all-true mask.
6940 if (Decision.Variant->arg_size() > Ops.size()) {
6941 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
6942 Ops.push_back(Mask);
6943 }
// Append the last non-mask operand of the original call (presumably the
// called function operand - TODO confirm).
6944 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
6945 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
6946 *VPI, VPI->getDebugLoc());
6947 break;
6948 }
6949 case CallWideningDecision::KindTy::Scalarize:
6950 Replacement = RecipeBuilder.handleReplication(VPI, Range);
6951 break;
6952 }
6953
6954 Replacement->insertBefore(VPI);
6955 VPI->replaceAllUsesWith(Replacement);
6956 VPI->eraseFromParent();
6957 }
6958 }
6959}
6960
// NOTE(review): the first signature lines of this definition (listing lines
// 6961-6962) are absent, as are lines 6970, 6993, 7005, 7011, 7026, 7038 and
// 7046; the function name, leading parameters, the block traversal header,
// the add-rec match pattern, the cost call, the call that wraps IsProfitable
// and the trailing arguments of three builder calls are not visible here.
//
// Visible flow: for each non-consecutive widened load whose address is a
// widened GEP with a constant-step affine add-rec SCEV, and for which a
// strided load is legal and strictly cheaper, build a strided base pointer
// and an experimental_vp_strided_load, then redirect the load's users to it.
6963 Loop &L, VPCostContext &Ctx,
6964 VFRange &Range) {
6965 if (Plan.hasScalarVFOnly())
6966 return;
6967
6968 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
// Created lazily below on the first converted load and then reused.
6969 VPValue *I32VF = nullptr;
6971 vp_depth_first_shallow(VectorLoop->getEntry()))) {
6972 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
6973 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
6974 // TODO: Support strided store.
6975 // TODO: Transform reverse access into strided access with -1 stride.
6976 // TODO: Transform gather/scatter with uniform address into strided access
6977 // with 0 stride.
6978 // TODO: Transform interleave access into multiple strided accesses.
6979 if (!LoadR || LoadR->isConsecutive())
6980 continue;
6981
6982 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
6983 if (!Ptr)
6984 continue;
6985
6986 // Check if this is a strided access by analyzing the address SCEV for an
6987 // affine addRec.
6988 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
6989 const SCEV *Start;
6990 const SCEVConstant *Step;
6991 // TODO: Support non-constant loop invariant stride.
6992 if (!match(PtrSCEV,
6994 m_SpecificLoop(&L))))
6995 continue;
6996
6997 Type *LoadTy = LoadR->getScalarType();
6998 Align Alignment = LoadR->getAlign();
// Profitable means the strided form is legal for the vector type and
// strictly cheaper than the current load; equal cost keeps the load.
6999 auto IsProfitable = [&](ElementCount VF) {
7000 Type *DataTy = toVectorTy(LoadTy, VF);
7001 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7002 return false;
7003 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
7004 const InstructionCost StridedLoadStoreCost =
7006 Intrinsic::experimental_vp_strided_load, DataTy,
7007 LoadR->isMasked(), Alignment, Ctx);
7008 return StridedLoadStoreCost < CurrentCost;
7009 };
7010
7012 Range))
7013 continue;
7014
7015 // Invalidate the legacy widening decision so the cost of replaced load is
7016 // not counted during precomputeCosts.
7017 // TODO: Remove once the legacy exit cost computation is retired.
7018 for (ElementCount VF : Range)
7019 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7020
7021 // Get VF as i32 for the vector length operand.
7022 if (!I32VF) {
7023 VPBuilder Builder(Plan.getVectorPreheader());
7024 I32VF = Builder.createScalarZExtOrTrunc(
7025 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7027 }
7028
7029 VPBuilder Builder(LoadR);
7030 // Create the base pointer of strided access.
7031 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7032 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7033 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7034 assert(IndexTy == StrideInBytes->getScalarType() &&
7035 "Stride type from SCEV must match the index type");
7036 VPValue *CanIVTyStride = Builder.createScalarSExtOrTrunc(
7037 StrideInBytes, VectorLoop->getCanonicalIVType(), IndexTy,
// Base = Start + CanonicalIV * Stride; the no-wrap flags are taken from the
// add-recurrence.
7039 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
7040 auto *Offset = Builder.createOverflowingOp(
7041 Instruction::Mul, {VectorLoop->getCanonicalIV(), CanIVTyStride},
7042 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
7043 auto *BasePtr = Builder.createNoWrapPtrAdd(
7044 StartVPV, Offset,
7045 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7047
7048 // Create a new vector pointer for strided access.
7049 VPValue *NewPtr = Builder.createVectorPointer(
7050 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7051 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7052
// An unmasked load gets an all-true mask for the intrinsic's mask operand.
7053 VPValue *Mask = LoadR->getMask();
7054 if (!Mask)
7055 Mask = Plan.getTrue();
7056 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7057 Intrinsic::experimental_vp_strided_load,
7058 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7059 LoadR->getDebugLoc());
// The original load is not erased here (presumably removed later as dead -
// TODO confirm).
7060 LoadR->replaceAllUsesWith(StridedLoad);
7061 }
7062 }
7063}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, LLVMContext &Ctx)
Try to fold R using InstSimplifyFolder.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if InterleaveR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with an EVL-based IV, fix up recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is a VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152
static DebugLoc getUnknown()
Definition DebugLoc.h:151
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:901
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1666
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1075
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3987
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4338
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4413
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4365
iterator end()
Definition VPlan.h:4375
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4373
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4426
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4387
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2909
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2959
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2949
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2965
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2945
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:342
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:361
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:251
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:269
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:287
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:323
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:307
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3440
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1653
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4019
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:559
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:532
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:544
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:554
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4113
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3485
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2406
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2453
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2442
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2140
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4491
Class to record and manage LLVM IR flags.
Definition VPlan.h:694
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:891
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1467
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1318
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1268
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1314
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1263
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1260
@ CanonicalIVIncrementForPart
Definition VPlan.h:1244
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1271
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3060
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3052
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3081
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3133
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3091
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3649
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should be widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3304
A recipe for handling reduction phis.
Definition VPlan.h:2811
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2862
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2855
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2873
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3184
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4548
const VPBlockBase * getEntry() const
Definition VPlan.h:4592
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4624
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4609
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4668
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4676
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4660
const VPBlockBase * getExiting() const
Definition VPlan.h:4604
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4617
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3349
bool isSingleScalar() const
Definition VPlan.h:3405
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
bool isPredicated() const
Definition VPlan.h:3407
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3424
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:189
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4183
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:608
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:679
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
operand_range operands()
Definition VPlanValue.h:455
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:428
unsigned getNumOperands() const
Definition VPlanValue.h:422
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:423
void addOperand(VPValue *Operand)
Definition VPlanValue.h:417
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:163
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1481
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1487
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2251
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2074
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1855
Instruction::CastOps getOpcode() const
Definition VPlan.h:1891
A recipe for handling GEP instructions.
Definition VPlan.h:2183
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2477
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2511
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2529
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2514
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2534
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2570
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2617
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2621
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2632
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2643
A recipe for widening vector intrinsics.
Definition VPlan.h:1902
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3685
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2701
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1794
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1815
unsigned getOpcode() const
Definition VPlan.h:1834
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4696
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5021
bool hasVF(ElementCount VF) const
Definition VPlan.h:4919
const DataLayout & getDataLayout() const
Definition VPlan.h:4901
LLVMContext & getContext() const
Definition VPlan.h:4897
VPBasicBlock * getEntry()
Definition VPlan.h:4792
bool hasScalableVF() const
Definition VPlan.h:4920
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4855
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4876
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4926
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4992
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4895
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4998
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5070
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5024
bool hasUF(unsigned UF) const
Definition VPlan.h:4944
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4845
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4885
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4882
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4969
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4995
void setVF(ElementCount VF)
Definition VPlan.h:4907
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4960
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1068
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4947
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4869
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4821
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5047
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4989
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4797
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4892
bool hasScalarVFOnly() const
Definition VPlan.h:4937
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4835
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4888
void setUF(unsigned UF)
Definition VPlan.h:4952
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5102
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1224
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5003
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:139
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:84
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and returns a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1850
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:863
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2793
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1946
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:246
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:297
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3796
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3747
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3898
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3845
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...