LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/TypeSwitch.h"
32#include "llvm/Analysis/Loads.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
89 NewRecipe = new VPWidenGEPRecipe(GEP->getSourceElementType(),
90 Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc(), GEP);
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96
97 // The noalias.scope.decl intrinsic declares a noalias scope that
98 // is valid for a single iteration. Emitting it as a single-scalar
99 // replicate would incorrectly extend the scope across multiple
100 // original iterations packed into one vector iteration.
101 // FIXME: If we want to vectorize this loop, then we have to drop
102 // all the associated !alias.scope and !noalias.
103 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
104 return false;
105
106 // These intrinsics are recognized by getVectorIntrinsicIDForCall
107 // but are not widenable. Emit them as replicate instead of widening.
108 if (VectorID == Intrinsic::assume ||
109 VectorID == Intrinsic::lifetime_end ||
110 VectorID == Intrinsic::lifetime_start ||
111 VectorID == Intrinsic::sideeffect ||
112 VectorID == Intrinsic::pseudoprobe) {
113 // If the operand of llvm.assume holds before vectorization, it will
114 // also hold per lane.
115 // llvm.pseudoprobe requires to be duplicated per lane for accurate
116 // sample count.
117 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
118 VectorID != Intrinsic::pseudoprobe;
119 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
120 /*IsSingleScalar=*/IsSingleScalar,
121 /*Mask=*/nullptr, *VPI, *VPI,
122 Ingredient.getDebugLoc());
123 } else {
124 NewRecipe = new VPWidenIntrinsicRecipe(
125 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
126 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
127 }
128 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
129 NewRecipe = new VPWidenCastRecipe(
130 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
131 VPIRFlags(*CI), VPIRMetadata(*CI));
132 } else {
133 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
134 *VPI, Ingredient.getDebugLoc());
135 }
136 } else {
138 "inductions must be created earlier");
139 continue;
140 }
141
142 NewRecipe->insertBefore(&Ingredient);
143 if (NewRecipe->getNumDefinedValues() == 1)
144 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
145 else
146 assert(NewRecipe->getNumDefinedValues() == 0 &&
147 "Only recpies with zero or one defined values expected");
148 Ingredient.eraseFromParent();
149 }
150 }
151 return true;
152}
153
154/// Helper for extra no-alias checks via known-safe recipe and SCEV.
157 VPReplicateRecipe &GroupLeader;
158 PredicatedScalarEvolution *PSE = nullptr;
159 const Loop *L = nullptr;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses.
// Both recipes must have the Store opcode; the address is read from operand 1
// and the stored value's type from operand 0.
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
// Without a PSE or a loop no SCEV-based reasoning is possible; the result is
// then true only when both pointers refer to the very same recipe.
168 if (!PSE || !L)
169 return A == B;
170
171 VPValue *AddrA = A->getOperand(1);
172 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, *PSE, L);
173 VPValue *AddrB = B->getOperand(1);
174 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, *PSE, L);
// NOTE(review): listing line 175 is absent from this extract, so the
// condition guarding the following return is not visible here.
176 return false;
177
// The difference of the two address SCEVs must match a constant APInt,
// otherwise give up.
178 const APInt *Distance;
179 ScalarEvolution &SE = *PSE->getSE();
180 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
181 return false;
182
183 const DataLayout &DL = SE.getDataLayout();
184 Type *TyA = A->getOperand(0)->getScalarType();
185 uint64_t SizeA = DL.getTypeStoreSize(TyA);
186 Type *TyB = B->getOperand(0)->getScalarType();
187 uint64_t SizeB = DL.getTypeStoreSize(TyB);
188
189 // Use the maximum store size to ensure no overlap from either direction.
190 // Currently only handles fixed sizes, as it is only used for
191 // replicating VPReplicateRecipes.
192 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
193
194 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): listing line 195 is absent from this extract; MaxVF is
// presumably derived from VFs there - confirm against the full file.
196 if (MaxVF.isScalable())
197 return false;
// No overlap when |Distance| is at least MaxVF * MaxStoreSize.
198 return Distance->abs().uge(
199 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
200 }
201
202public:
205 const Loop &L)
206 : ExcludeRecipes(ExcludeRecipes.begin(), ExcludeRecipes.end()),
207 GroupLeader(GroupLeader), PSE(&PSE), L(&L) {}
208
// Construct with only a group leader. PSE and L keep their in-class nullptr
// defaults, so isNoAliasViaDistance takes its no-SCEV early return.
209 SinkStoreInfo(VPReplicateRecipe &GroupLeader) : GroupLeader(GroupLeader) {}
210
211 /// Return true if \p R should be skipped during alias checking, either
212 /// because it's in the exclude set or because no-alias can be proven via
213 /// SCEV.
214 bool shouldSkip(VPRecipeBase &R) const {
// Store is null when R is not a VPReplicateRecipe; in that case only the
// exclude-set lookup can cause a skip.
215 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
// The distance-based check is always made against the group leader.
216 return ExcludeRecipes.contains(Store) ||
217 (Store && isNoAliasViaDistance(Store, &GroupLeader));
218 }
219};
220
221/// Check if a memory operation doesn't alias with memory operations using
222/// scoped noalias metadata, in blocks in the single-successor chain between \p
223/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
224/// write to memory are checked (for load hoisting). Otherwise recipes that both
225/// read and write memory are checked, and SCEV is used to prove no-alias
226/// between the group leader and other replicate recipes (for store sinking).
227static bool
229 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
230 std::optional<SinkStoreInfo> SinkInfo = {}) {
231 bool CheckReads = SinkInfo.has_value();
232 if (!MemLoc.AATags.Scope)
233 return false;
234
235 for (VPBasicBlock *VPBB :
237 for (VPRecipeBase &R : *VPBB) {
238 if (SinkInfo && SinkInfo->shouldSkip(R))
239 continue;
240
241 // Skip recipes that don't need checking.
242 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
243 continue;
244
246 if (!Loc)
247 // Conservatively assume aliasing for memory operations without
248 // location.
249 return false;
250
252 return false;
253 }
254 }
255 return true;
256}
257
258/// Get the value type of the replicate load or store. \p IsLoad indicates
259/// whether it is a load.
/// For a load the scalar type of the recipe \p R itself is returned; otherwise
/// the scalar type of operand 0 of \p R is returned.
// NOTE(review): listing line 260 (the function's declarator) is absent from
// this extract.
261 return (IsLoad ? R : R->getOperand(0))->getScalarType();
262}
263
264/// Collect either replicated Loads or Stores grouped by their address SCEV and
265/// their load-store type, in a deep-traversal of the vector loop region in \p
266/// Plan.
267template <unsigned Opcode>
270 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
271 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
272 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
273 "Only Load and Store opcodes supported");
274 constexpr bool IsLoad = (Opcode == Instruction::Load);
277 RecipesByAddressAndType;
280 for (VPRecipeBase &R : *VPBB) {
281 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
282 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
283 continue;
284
285 // For loads, operand 0 is address; for stores, operand 1 is address.
286 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
287 const Type *LoadStoreTy = getLoadStoreValueType(RepR, IsLoad);
288 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
289 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
290 RecipesByAddressAndType[{AddrSCEV, LoadStoreTy}].push_back(RepR);
291 }
292 }
293 auto Groups = to_vector(RecipesByAddressAndType.values());
294 VPDominatorTree VPDT(Plan);
295 for (auto &Group : Groups) {
296 // Sort mem ops by dominance order, with earliest (most dominating) first.
298 return VPDT.properlyDominates(A, B);
299 });
300 }
301 return Groups;
302}
303
/// Sink single-def recipes that feed the conditional block of a replicate
/// region into that block. Candidates are seeded from the operands of the
/// recipes already in such blocks, and the operands of every sunk recipe are
/// considered in turn. Returns true if at least one recipe was moved.
304static bool sinkScalarOperands(VPlan &Plan) {
305 auto Iter = vp_depth_first_deep(Plan.getEntry());
306 bool ScalarVFOnly = Plan.hasScalarVFOnly();
307 bool Changed = false;
308
// NOTE(review): listing line 309 is absent from this extract; WorkList is
// presumably declared there - confirm against the full file.
// Helper: queue the recipe defining Op for sinking into SinkTo if it passes
// all the filters below.
310 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
311 VPBasicBlock *SinkTo, VPValue *Op) {
312 auto *Candidate =
313 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
314 if (!Candidate)
315 return;
316
317 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
318 // for now.
// NOTE(review): listing line 319 (the condition for this return) is absent
// from this extract.
320 return;
321
// Nothing to do if the candidate already lives in the target block, and
// bail out if the recipe is reported as not sinkable.
322 if (Candidate->getParent() == SinkTo ||
323 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
324 return;
325
// Single-scalar replicate recipes are only queued when the plan has just a
// scalar VF.
326 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
327 if (!ScalarVFOnly && RepR->isSingleScalar())
328 return;
329
330 WorkList.insert({SinkTo, Candidate});
331 };
332
333 // First, collect the operands of all recipes in replicate blocks as seeds for
334 // sinking.
// NOTE(review): listing line 335 (the loop header introducing VPR) is absent
// from this extract.
336 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
337 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
338 continue;
// Only handle regions whose first entry successor leads directly to the
// region's exiting block.
339 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
340 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
341 continue;
342 for (auto &Recipe : *VPBB)
343 for (VPValue *Op : Recipe.operands())
344 InsertIfValidSinkCandidate(VPBB, Op);
345 }
346
347 // Try to sink each replicate or scalar IV steps recipe in the worklist.
// WorkList can grow inside the loop body (operands of sunk recipes are
// queued), hence the index-based iteration with a re-evaluated size.
348 for (unsigned I = 0; I != WorkList.size(); ++I) {
349 VPBasicBlock *SinkTo;
350 VPSingleDefRecipe *SinkCandidate;
351 std::tie(SinkTo, SinkCandidate) = WorkList[I];
352
353 // All recipe users of SinkCandidate must be in the same block SinkTo or all
354 // users outside of SinkTo must only use the first lane of SinkCandidate. In
355 // the latter case, we need to duplicate SinkCandidate.
356 auto UsersOutsideSinkTo =
357 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
358 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
359 });
360 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
361 return !U->usesFirstLaneOnly(SinkCandidate);
362 }))
363 continue;
364 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
365
366 if (NeedsDuplicating) {
// Duplication is not attempted for scalar-VF-only plans; the candidate is
// left where it is.
367 if (ScalarVFOnly)
368 continue;
369 VPSingleDefRecipe *Clone;
370 if (auto *SinkCandidateRepR =
371 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
372 // TODO: Handle converting to uniform recipes as separate transform,
373 // then cloning should be sufficient here.
// NOTE(review): listing line 374 is absent from this extract; Clone is
// presumably assigned a newly constructed recipe there, with the
// arguments on the following lines.
375 SinkCandidateRepR->getOpcode(), SinkCandidate->operands(),
376 /*Mask=*/nullptr, *SinkCandidateRepR, *SinkCandidateRepR,
377 SinkCandidate->getDebugLoc(), SinkCandidate->getUnderlyingInstr());
378 // TODO: add ".cloned" suffix to name of Clone's VPValue.
379 } else {
380 Clone = SinkCandidate->clone();
381 }
382
// The clone stays at the original position and takes over all uses outside
// SinkTo; the original is the one that gets moved below.
383 Clone->insertBefore(SinkCandidate);
384 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
385 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
386 });
387 }
388 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
// The operands of the recipe just sunk may now be sinkable as well.
389 for (VPValue *Op : SinkCandidate->operands())
390 InsertIfValidSinkCandidate(SinkTo, Op);
391 Changed = true;
392 }
393 return Changed;
394}
395
396/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
397/// the mask.
/// Returns nullptr unless the entry is a VPBasicBlock holding exactly one
/// recipe and that recipe is a VPBranchOnMaskRecipe; the mask returned is that
/// recipe's operand 0.
// NOTE(review): listing line 398 (the function's declarator) is absent from
// this extract.
399 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
400 if (!EntryBB || EntryBB->size() != 1 ||
401 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
402 return nullptr;
403
404 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
405}
406
407/// If \p R is a triangle region, return the 'then' block of the triangle.
/// Returns nullptr when the entry block does not have exactly two
/// VPBasicBlock successors, or when those successors do not form a triangle.
// NOTE(review): listing line 408 (the function's declarator) is absent from
// this extract.
409 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
410 if (EntryBB->getNumSuccessors() != 2)
411 return nullptr;
412
413 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
414 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
415 if (!Succ0 || !Succ1)
416 return nullptr;
417
// Between them the two successors must have exactly one outgoing edge.
418 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
419 return nullptr;
// The 'then' block is the successor whose single successor is the other
// entry successor.
420 if (Succ0->getSingleSuccessor() == Succ1)
421 return Succ0;
422 if (Succ1->getSingleSuccessor() == Succ0)
423 return Succ1;
424 return nullptr;
425}
426
427// Merge replicate regions into their successor region, if a replicate region
428// is connected to a successor replicate region with the same predicate by a
429// single, empty VPBasicBlock.
// Returns true if at least one region was merged.
// NOTE(review): listing line 430 (the function's declarator) is absent from
// this extract.
431 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
432
433 // Collect replicate regions followed by an empty block, followed by another
434 // replicate region with matching masks, to process up front. This is to avoid
435 // iterator invalidation issues while merging regions.
// NOTE(review): listing lines 436-437 (the WorkList declaration and the start
// of the loop header introducing Region1) are absent from this extract.
438 vp_depth_first_deep(Plan.getEntry()))) {
439 if (!Region1->isReplicator())
440 continue;
441 auto *MiddleBasicBlock =
442 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
443 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
444 continue;
445
446 auto *Region2 =
447 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
448 if (!Region2 || !Region2->isReplicator())
449 continue;
450
// Both regions must be guarded by the very same mask value.
451 VPValue *Mask1 = getPredicatedMask(Region1);
452 VPValue *Mask2 = getPredicatedMask(Region2);
453 if (!Mask1 || Mask1 != Mask2)
454 continue;
455
456 assert(Mask1 && Mask2 && "both region must have conditions");
457 WorkList.push_back(Region1);
458 }
459
460 // Move recipes from Region1 to its successor region, if both are triangles.
461 for (VPRegionBlock *Region1 : WorkList) {
462 if (TransformedRegions.contains(Region1))
463 continue;
464 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
465 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
466
467 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
468 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
469 if (!Then1 || !Then2)
470 continue;
471
472 // Note: No fusion-preventing memory dependencies are expected in either
473 // region. Such dependencies should be rejected during earlier dependence
474 // checks, which guarantee accesses can be re-ordered for vectorization.
475 //
476 // Move recipes to the successor region.
// Then1 is walked in reverse and each recipe is placed at Then2's first
// non-phi position.
477 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
478 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
479
480 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
481 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
482
483 // Move VPPredInstPHIRecipes from the merge block to the successor region's
484 // merge block. Update all users inside the successor region to use the
485 // original values.
486 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
487 VPValue *PredInst1 =
488 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
489 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
// Users now located in Then2 can use the phi's operand directly.
490 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
491 return cast<VPRecipeBase>(&U)->getParent() == Then2;
492 });
493
494 // Remove phi recipes that are unused after merging the regions.
495 if (Phi1ToMove.getVPSingleValue()->user_empty()) {
496 Phi1ToMove.eraseFromParent();
497 continue;
498 }
499 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
500 }
501
502 // Remove the dead recipes in Region1's entry block.
503 for (VPRecipeBase &R :
504 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
505 R.eraseFromParent();
506
507 // Finally, remove the first region.
// Each predecessor of Region1 is rewired to MiddleBasicBlock, then Region1
// is detached from MiddleBasicBlock.
508 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
509 VPBlockUtils::disconnectBlocks(Pred, Region1);
510 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
511 }
512 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
513 TransformedRegions.insert(Region1);
514 }
515
516 return !TransformedRegions.empty();
517}
518
520 VPRegionBlock *ParentRegion,
521 VPlan &Plan) {
// NOTE(review): listing line 519 (the start of this declarator, which
// introduces PredRecipe) is absent from this extract.
// Builds a three-block region named "pred.<opcode>" around an unmasked copy
// of PredRecipe: ".entry" (branch on the mask), ".if" (the unmasked recipe)
// and ".continue" (optional phi for the result). PredRecipe is erased and the
// new region, parented to ParentRegion, is returned.
522 Instruction *Instr = PredRecipe->getUnderlyingInstr();
523 // Build the triangular if-then region.
524 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
525 assert(Instr->getParent() && "Predicated instruction not in any basic block");
526 auto *BlockInMask = PredRecipe->getMask();
527 auto *MaskDef = BlockInMask->getDefiningRecipe();
// The branch takes the debug location of the mask's defining recipe when
// there is one, and an unknown location otherwise.
528 auto *BOMRecipe = new VPBranchOnMaskRecipe(
529 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
530 auto *Entry =
531 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
532
533 // Replace predicated replicate recipe with a replicate recipe without a
534 // mask but in the replicate region.
535 auto *RecipeWithoutMask = new VPReplicateRecipe(
536 PredRecipe->getUnderlyingInstr(), PredRecipe->operandsWithoutMask(),
537 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
538 PredRecipe->getDebugLoc());
539 auto *Pred =
540 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
541 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
// NOTE(review): listing line 542 is absent from this extract; Region is
// presumably declared there and initialized by the call below.
543 Plan.createReplicateRegion(Entry, Exiting, RegionName);
544
545 // Note: first set Entry as region entry and then connect successors starting
546 // from it in order, to propagate the "parent" of each VPBasicBlock.
547 Region->setParent(ParentRegion);
548 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
549 VPBlockUtils::connectBlocks(Pred, Exiting);
550
// A phi in the exiting block is only needed when the predicated recipe's
// result has users; they are redirected to that phi.
551 if (!PredRecipe->user_empty()) {
552 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
553 RecipeWithoutMask->getDebugLoc());
554 Exiting->appendRecipe(PHIRecipe);
555 PredRecipe->replaceAllUsesWith(PHIRecipe);
556 }
557 PredRecipe->eraseFromParent();
558 return Region;
559}
560
/// Wrap every predicated VPReplicateRecipe in \p Plan in its own replicate
/// region: the recipe's block is split at the recipe and a region built by
/// createReplicateRegion is created for it.
561static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): listing lines 562-563 (the WorkList declaration and the start
// of the loop header introducing VPBB) are absent from this extract.
564 vp_depth_first_deep(Plan.getEntry()))) {
// Collect first; the blocks are restructured in the second loop.
565 for (VPRecipeBase &R : *VPBB)
566 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
567 if (RepR->isPredicated())
568 WorkList.push_back(RepR);
569 }
570 }
571
572 unsigned BBNum = 0;
573 for (VPReplicateRecipe *RepR : WorkList) {
574 VPBasicBlock *CurrentBlock = RepR->getParent();
575 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
576
// The split-off block is named "<original IR block name>.<counter>" when
// the IR block has a name, and gets an empty name otherwise.
577 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
578 SplitBlock->setName(
579 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
580 // Record predicated instructions for above packing optimizations.
// NOTE(review): listing lines 581 and 583 are absent from this extract;
// Region is presumably declared on 581 from the call below, and the
// statement on 583 is not visible.
582 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
584
// If CurrentBlock was the exiting block of its parent region, the split-off
// tail takes over that role.
585 VPRegionBlock *ParentRegion = Region->getParent();
586 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
587 ParentRegion->setExiting(SplitBlock);
588 }
589}
590
594 vp_depth_first_deep(Plan.getEntry()))) {
// NOTE(review): listing lines 591-593 (the function's declarator, the
// WorkList declaration and the start of this loop header) are absent from
// this extract. The visible code folds each collected block into its single
// predecessor and returns true if any block was collected.
595 // Don't fold the blocks in the skeleton of the Plan into their single
596 // predecessors for now.
597 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
598 if (!VPBB->getParent())
599 continue;
// Only blocks whose single predecessor is a plain VPBasicBlock (not a
// VPIRBasicBlock) with exactly one successor are collected.
600 auto *PredVPBB =
601 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
602 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
603 isa<VPIRBasicBlock>(PredVPBB))
604 continue;
605 WorkList.push_back(VPBB);
606 }
607
608 for (VPBasicBlock *VPBB : WorkList) {
609 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
// Append all recipes of VPBB to the end of the predecessor.
610 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
611 R.moveBefore(*PredVPBB, PredVPBB->end());
612 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// Keep the parent region's exiting block valid if VPBB held that role.
613 auto *ParentRegion = VPBB->getParent();
614 if (ParentRegion && ParentRegion->getExiting() == VPBB)
615 ParentRegion->setExiting(PredVPBB);
616 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
617 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
618 }
619 return !WorkList.empty();
620}
621
623 // Convert masked VPReplicateRecipes to if-then region blocks.
625
626 bool ShouldSimplify = true;
627 while (ShouldSimplify) {
628 ShouldSimplify = sinkScalarOperands(Plan);
629 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
630 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
631 }
632}
633
634/// Remove redundant casts of inductions.
635///
636/// Such redundant casts are casts of induction variables that can be ignored,
637/// because we already proved that the casted phi is equal to the uncasted phi
638/// in the vectorized loop. There is no need to vectorize the cast - the same
639/// value can be used for both the phi and casts in the vector loop.
641 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
643 if (!IV || IV->getTruncInst())
644 continue;
645
646 // A sequence of IR Casts has potentially been recorded for IV, which
647 // *must be bypassed* when the IV is vectorized, because the vectorized IV
648 // will produce the desired casted value. This sequence forms a def-use
649 // chain and is provided in reverse order, ending with the cast that uses
650 // the IV phi. Search for the recipe of the last cast in the chain and
651 // replace it with the original IV. Note that only the final cast is
652 // expected to have users outside the cast-chain and the dead casts left
653 // over will be cleaned up later.
654 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
655 VPValue *FindMyCast = IV;
656 for (Instruction *IRCast : reverse(Casts)) {
657 VPSingleDefRecipe *FoundUserCast = nullptr;
658 for (auto *U : FindMyCast->users()) {
659 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
660 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
661 FoundUserCast = UserCast;
662 break;
663 }
664 }
665 // A cast recipe in the chain may have been removed by earlier DCE.
666 if (!FoundUserCast)
667 break;
668 FindMyCast = FoundUserCast;
669 }
670 if (FindMyCast != IV)
671 FindMyCast->replaceAllUsesWith(IV);
672 }
673}
674
677 Instruction::BinaryOps InductionOpcode,
678 FPMathOperator *FPBinOp, Instruction *TruncI,
679 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
680 VPBuilder &Builder) {
// NOTE(review): listing lines 675-676 (the start of this declarator, which
// introduces Plan and Kind) are absent from this extract.
// Builds, via Builder, a derived IV from the loop region's canonical IV
// (truncated to TruncI's type when TruncI is given) and returns scalar IV
// steps over it. Step is truncated when its type differs from the result
// type.
681 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
682 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
683 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
684 VPSingleDefRecipe *BaseIV =
685 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
686
687 // Truncate base induction if needed.
688 Type *ResultTy = BaseIV->getScalarType();
689 if (TruncI) {
690 Type *TruncTy = TruncI->getType();
691 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
692 "Not truncating.");
693 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
694 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
695 ResultTy = TruncTy;
696 }
697
698 // Truncate step if needed.
699 Type *StepTy = Step->getScalarType();
700 if (ResultTy != StepTy) {
701 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
702 "Not truncating.");
703 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
704 auto *VecPreheader =
// NOTE(review): listing line 705 (the initializer of VecPreheader) is absent
// from this extract.
// The guard restores Builder's insert point at the end of this scope, so
// only the step truncation is emitted at VecPreheader.
706 VPBuilder::InsertPointGuard Guard(Builder);
707 Builder.setInsertPoint(VecPreheader);
708 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
709 }
710 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
711 &Plan.getVF(), DL);
712}
713
715 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
717 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
718 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
719 if (!LoopRegion)
720 return;
721
722 auto *WideCanIV =
724 if (!WideCanIV)
725 return;
726
727 Type *CanIVTy = LoopRegion->getCanonicalIVType();
728
729 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
730 // IV.
731 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
732 VPBuilder Builder(WideCanIV);
733 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
734 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
735 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
736 WideCanIV->getDebugLoc(), Builder));
737 WideCanIV->eraseFromParent();
738 return;
739 }
740
741 if (vputils::onlyScalarValuesUsed(WideCanIV))
742 return;
743
744 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
745 // in the header, reuse it instead of introducing another wide induction phi.
746 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
747 for (VPRecipeBase &Phi : Header->phis()) {
749 if (!match(&Phi, m_CanonicalWidenIV(WidenIV)))
750 continue;
751 // The reused wide IV feeds the header mask, whose lanes may extend past
752 // the trip count; drop flags that only hold inside the scalar loop.
753 WidenIV->dropPoisonGeneratingFlags();
754 WideCanIV->replaceAllUsesWith(WidenIV);
755 WideCanIV->eraseFromParent();
756 return;
757 }
758
759 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
760 auto *VecTy = VectorType::get(CanIVTy, VF);
761 InstructionCost BroadcastCost = TTI.getShuffleCost(
763 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
764 if (PHICost > BroadcastCost)
765 return;
766
767 // Bail out if the additional wide induction phi increases the expected spill
768 // cost.
769 VPRegisterUsage UnrolledBase =
770 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
771 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
772 NumUsers *= UF;
773 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
774 VPRegisterUsage Projected = UnrolledBase;
775 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
776 if (Projected.spillCost(TTI, CostKind) >
777 UnrolledBase.spillCost(TTI, CostKind))
778 return;
779
782 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
783 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
784 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
785 WideCanIV->getNoWrapFlags(), WideCanIV->getDebugLoc());
786 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
787 WideCanIV->replaceAllUsesWith(NewWideIV);
788 WideCanIV->eraseFromParent();
789}
790
791/// Returns true if \p R is dead and can be removed.
/// Apart from the conditional-assume special case below, a recipe is dead when
/// it may not have side effects and none of its defined values has a user.
792static bool isDeadRecipe(VPRecipeBase &R) {
793 // Do remove conditional assume instructions as their conditions may be
794 // flattened.
795 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
796 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
// NOTE(review): listing line 797 (the remainder of this condition) is absent
// from this extract.
798 if (IsConditionalAssume)
799 return true;
800
801 if (R.mayHaveSideEffects())
802 return false;
803
804 // Recipe is dead if no user keeps the recipe alive.
805 return all_of(R.definedValues(), [](VPValue *V) { return V->user_empty(); });
806}
807
810 Plan.getEntry());
812 // The recipes in the block are processed in reverse order, to catch chains
813 // of dead recipes.
814 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
815 if (isDeadRecipe(R)) {
816 R.eraseFromParent();
817 continue;
818 }
819
820 // Check if R is a dead VPPhi <-> update cycle and remove it.
821 VPValue *Start, *Incoming;
822 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
823 continue;
824 auto *PhiR = cast<VPPhi>(&R);
825 VPUser *PhiUser = PhiR->getSingleUser();
826 if (!PhiUser)
827 continue;
828 if (PhiUser != Incoming->getDefiningRecipe() ||
829 Incoming->getNumUsers() != 1)
830 continue;
831 PhiR->replaceAllUsesWith(Start);
832 PhiR->eraseFromParent();
833 Incoming->getDefiningRecipe()->eraseFromParent();
834 }
835 }
836}
837
840 for (unsigned I = 0; I != Users.size(); ++I) {
// NOTE(review): listing lines 838-839 (the function's declarator and the
// initialization of Users) and 841 (the definition of Cur) are absent from
// this extract.
// Users grows while it is being traversed (insert_range below), so the
// index-based loop also visits the users that were added along the way.
842 for (VPValue *V : Cur->definedValues())
843 Users.insert_range(V->users());
844 }
845 return Users.takeVector();
846}
847
848/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
849/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
850/// generates scalar values.
/// The visible code only builds and returns the PtrAdd (named "next.gep");
/// no use of the original recipe is rewritten in the lines shown here.
851static VPValue *
// NOTE(review): listing line 852 (the function name and first parameter,
// which introduces PtrIV) is absent from this extract.
853 VPlan &Plan, VPBuilder &Builder) {
// NOTE(review): listing line 854 is absent from this extract; ID is
// presumably bound to the recipe's induction descriptor there.
// The steps start at zero of the step's type; the step is the recipe's
// operand 1.
855 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
856 VPValue *StepV = PtrIV->getOperand(1);
// NOTE(review): listing line 857 is absent from this extract; Steps is
// presumably assigned there from a call taking the arguments below.
858 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
859 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
860
861 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
862 PtrIV->getDebugLoc(), "next.gep");
863}
864
865/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
866/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
867/// VPWidenPointerInductionRecipe will generate vectors only. If some users
868/// require vectors while others require scalars, the scalar uses need to extract
869/// the scalars from the generated vectors (Note that this is different to how
870/// int/fp inductions are handled). Legalize extract-from-ends using uniform
871/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
872/// the correct end value is available. Also optimize
873/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
874/// providing them scalar steps built on the canonical scalar IV and update the
875/// original IV's users. This is an optional optimization to reduce the needs of
876/// vector extracts.
879 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
880 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
881 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
882 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
883 if (!PhiR)
884 continue;
885
886 // Try to narrow wide and replicating recipes to uniform recipes, based on
887 // VPlan analysis.
888 // TODO: Apply to all recipes in the future, to replace legacy uniformity
889 // analysis.
890 auto Users = collectUsersRecursively(PhiR);
891 for (VPUser *U : reverse(Users)) {
892 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
893 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
894 // Skip recipes that shouldn't be narrowed.
895 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
896 Def->user_empty() || !Def->getUnderlyingValue() ||
897 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
898 continue;
899
900 // Skip recipes that may have other lanes than their first used.
902 continue;
903
904 // TODO: Support scalarizing ExtractValue.
905 if (match(Def,
907 continue;
908
910 Def->getUnderlyingInstr()->getOpcode(), Def->operands(),
911 /*Mask=*/nullptr, *Def, {}, DebugLoc::getUnknown(),
912 Def->getUnderlyingInstr());
913 Clone->insertAfter(Def);
914 Def->replaceAllUsesWith(Clone);
915 }
916
917 // Replace wide pointer inductions which have only their scalars used by
918 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
919 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
920 if (!Plan.hasScalarVFOnly() &&
921 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
922 continue;
923
924 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
925 PtrIV->replaceAllUsesWith(PtrAdd);
926 continue;
927 }
928
929 // Replace widened induction with scalar steps for users that only use
930 // scalars.
931 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
932 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
933 return U->usesScalars(WideIV);
934 }))
935 continue;
936
937 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
939 Plan, ID.getKind(), ID.getInductionOpcode(),
940 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
941 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
942 WideIV->getDebugLoc(), Builder);
943
944 // Update scalar users of IV to use Step instead.
945 if (!HasOnlyVectorVFs) {
946 assert(!Plan.hasScalableVF() &&
947 "plans containing a scalar VF cannot also include scalable VFs");
948 WideIV->replaceAllUsesWith(Steps);
949 } else {
950 bool HasScalableVF = Plan.hasScalableVF();
951 WideIV->replaceUsesWithIf(Steps,
952 [WideIV, HasScalableVF](VPUser &U, unsigned) {
953 if (HasScalableVF)
954 return U.usesFirstLaneOnly(WideIV);
955 return U.usesScalars(WideIV);
956 });
957 }
958 }
959}
960
961 /// Check if \p VPV is an untruncated wide induction, either before or after the
962 /// increment. If so return the header IV (before the increment), otherwise
963 /// return null.
// NOTE(review): this extract omits listing lines 964-965 (presumably the
// function signature). The body refers to `VPV` and `PSE`, which must be
// declared there.
966 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
967 if (WideIV) {
968 // VPV itself is a wide induction, separately compute the end value for exit
969 // users if it is not a truncated IV.
970 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
971 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
972 }
973
974 // Check if VPV is an optimizable induction increment.
975 VPRecipeBase *Def = VPV->getDefiningRecipe();
976 if (!Def || Def->getNumOperands() != 2)
977 return nullptr;
// Look for the wide induction in operand 0 first, then in operand 1.
978 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
979 if (!WideIV)
980 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
981 if (!WideIV)
982 return nullptr;
983
// Returns true if VPV matches the increment form expected for WideIV's
// induction opcode.
984 auto IsWideIVInc = [&]() {
985 auto &ID = WideIV->getInductionDescriptor();
986
987 // Check if VPV increments the induction by the induction step.
988 VPValue *IVStep = WideIV->getStepValue();
989 switch (ID.getInductionOpcode()) {
990 case Instruction::Add:
991 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
992 case Instruction::FAdd:
993 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
994 case Instruction::FSub:
995 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
996 m_Specific(IVStep)));
997 case Instruction::Sub: {
998 // IVStep will be the negated step of the subtraction. Check if Step == -1
999 // * IVStep.
1000 VPValue *Step;
1001 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
1002 return false;
1003 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
1004 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
1005 ScalarEvolution &SE = *PSE.getSE();
// Both SCEVs must be computable before comparing against the negated step.
1006 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
1007 !isa<SCEVCouldNotCompute>(StepSCEV) &&
1008 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
1009 }
1010 default:
// Any other opcode is accepted only for pointer inductions incremented by a
// GEP of the IV with its step.
1011 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1012 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1013 m_Specific(WideIV->getStepValue())));
1014 }
// Every switch case above returns, so control never reaches this point.
1015 llvm_unreachable("should have been covered by switch above");
1016 };
1017 return IsWideIVInc() ? WideIV : nullptr;
1018}
1019
1020 /// Attempts to optimize the induction variable exit values for users in the
1021 /// early exit block.
// NOTE(review): this extract omits listing lines 1022-1023 (presumably the
// function signature, declaring `Plan`, `Op` and `PSE`) and 1025 (the start of
// the match expression whose tail is on line 1026). Returns nullptr when no
// optimized value can be produced.
1024 VPValue *Incoming, *Mask;
1026 m_VPValue(Incoming))))
1027 return nullptr;
1028
1029 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1030 if (!WideIV)
1031 return nullptr;
1032
// Truncated int/fp inductions are not handled here.
1033 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1034 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1035 return nullptr;
1036
1037 // Calculate the final index.
1038 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1039 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1040 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1041 auto *ExtractR = cast<VPInstruction>(Op);
// New recipes are created via a builder positioned at the extract recipe.
1042 VPBuilder B(ExtractR);
1043
1044 DebugLoc DL = ExtractR->getDebugLoc();
// The final index is the canonical IV plus the first active lane of Mask,
// with the lane index converted to the canonical IV type.
1045 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1046 FirstActiveLane = B.createScalarZExtOrTrunc(
1047 FirstActiveLane, CanonicalIVType, FirstActiveLane->getScalarType(), DL);
1048 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1049
1050 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1051 // changed it means the exit is using the incremented value, so we need to
1052 // add the step.
1053 if (Incoming != WideIV) {
1054 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1055 EndValue = B.createAdd(EndValue, One, DL);
1056 }
1057
// For non-canonical inductions, derive the IV value from the computed index
// using the induction's start and step.
1058 if (!match(WideIV, m_CanonicalWidenIV())) {
1059 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1060 VPIRValue *Start = WideIV->getStartValue();
1061 VPValue *Step = WideIV->getStepValue();
1062 EndValue = B.createDerivedIV(
1063 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1064 Start, EndValue, Step);
1065 }
1066
1067 return EndValue;
1068}
1069
1070 /// Compute the end value for \p WideIV, unless it is truncated. Creates a
1071 /// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): this extract omits listing line 1072 (the first line of the
// signature) and 1083 (presumably the declaration of `ID`, which is used
// below). Returns nullptr for truncated int/fp inductions.
1073 VPBuilder &VectorPHBuilder,
1074 VPValue *VectorTC) {
1075 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1076 // Truncated wide inductions resume from the last lane of their vector value
1077 // in the last vector iteration which is handled elsewhere.
1078 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1079 return nullptr;
1080
1081 VPIRValue *Start = WideIV->getStartValue();
1082 VPValue *Step = WideIV->getStepValue();
// The canonical widened IV uses VectorTC directly; every other induction gets
// a derived IV built from Start, VectorTC and Step.
1084 VPValue *EndValue = VectorTC;
1085 if (!match(WideIV, m_CanonicalWidenIV())) {
1086 EndValue = VectorPHBuilder.createDerivedIV(
1087 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1088 Start, VectorTC, Step);
1089 }
1090
1091 // EndValue is derived from the vector trip count (which has the same type as
1092 // the widest induction) and thus may be wider than the induction here.
1093 Type *ScalarTypeOfWideIV = WideIV->getScalarType();
1094 if (ScalarTypeOfWideIV != EndValue->getScalarType()) {
1095 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1096 ScalarTypeOfWideIV,
1097 WideIV->getDebugLoc());
1098 }
1099
1100 return EndValue;
1101}
1102
1103 /// Attempts to optimize the induction variable exit values for users in the
1104 /// exit block coming from the latch in the original scalar loop.
// NOTE(review): this extract omits listing lines 1106-1108 (the function name
// and parameter list, which must declare `Plan`, `Op`, `EndValues` and `PSE`)
// and 1110 (the match expression guarding the first `return nullptr`).
1105static VPValue *
1109 VPValue *Incoming;
1111 return nullptr;
1112
1113 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1114 if (!WideIV)
1115 return nullptr;
1116
// The end value is looked up in the caller-provided map; a missing entry is
// treated as a programming error.
1117 VPValue *EndValue = EndValues.lookup(WideIV);
1118 assert(EndValue && "Must have computed the end value up front");
1119
1120 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1121 // changed it means the exit is using the incremented value, so we don't
1122 // need to subtract the step.
1123 if (Incoming != WideIV)
1124 return EndValue;
1125
1126 // Otherwise, subtract the step from the EndValue.
1127 auto *ExtractR = cast<VPInstruction>(Op);
1128 VPBuilder B(ExtractR);
1129 VPValue *Step = WideIV->getStepValue();
1130 Type *ScalarTy = WideIV->getScalarType();
// Integer inductions: EndValue - Step.
1131 if (ScalarTy->isIntegerTy())
1132 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
// Pointer inductions: pointer-add of the negated step (0 - Step).
1133 if (ScalarTy->isPointerTy()) {
1134 Type *StepTy = Step->getScalarType();
1135 auto *Zero = Plan.getZero(StepTy);
1136 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1137 DebugLoc::getUnknown(), "ind.escape");
1138 }
// Floating-point inductions: apply the inverse of the induction's binary op
// (FSub for FAdd, FAdd otherwise), reusing its fast-math flags.
1139 if (ScalarTy->isFloatingPointTy()) {
1140 const auto &ID = WideIV->getInductionDescriptor();
1141 return B.createNaryOp(
1142 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1143 ? Instruction::FSub
1144 : Instruction::FAdd,
1145 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1146 }
1147 llvm_unreachable("all possible induction types must be handled");
1148 return nullptr;
1149}
1150
// NOTE(review): this extract omits listing lines 1151 (the first line of the
// signature), 1157 (presumably the declaration of `EndValues`), and 1189 and
// 1192 (the starts of the two calls whose argument lists appear on lines 1190
// and 1193, presumably assigning `Escape`).
1152 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1153 // Compute end values for all inductions.
1154 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1155 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
// End-value recipes are created at the start of the vector preheader.
1156 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// With tail folding the resume value is the plan's trip count; otherwise it is
// the vector trip count.
1158 VPValue *ResumeTC =
1159 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1160 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1161 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1162 if (!WideIV)
1163 continue;
// Inductions for which no end value can be computed get no map entry.
1164 if (VPValue *EndValue =
1165 tryToComputeEndValueForInduction(WideIV, VectorPHBuilder, ResumeTC))
1166 EndValues[WideIV] = EndValue;
1167 }
1168
// Replace exiting-IV-value recipes in the middle block by the precomputed end
// value when one exists, and erase the replaced recipe. The early-inc range
// keeps iteration valid across the erase.
1169 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1170 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1171 VPValue *Op;
1172 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1173 continue;
1174 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1175 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1176 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1177 R.eraseFromParent();
1178 }
1179 }
1180
1181 // Then, optimize exit block users.
1182 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1183 for (VPRecipeBase &R : ExitVPBB->phis()) {
1184 auto *ExitIRI = cast<VPIRPhi>(&R);
1185
// Operand Idx of the exit phi corresponds to predecessor Idx; values arriving
// from the middle block and from other predecessors are handled by different
// helpers.
1186 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1187 VPValue *Escape = nullptr;
1188 if (PredVPBB == MiddleVPBB)
1190 Plan, ExitIRI->getOperand(Idx), EndValues, PSE);
1191 else
1193 Plan, ExitIRI->getOperand(Idx), PSE);
// Only rewrite the operand when a helper produced a replacement.
1194 if (Escape)
1195 ExitIRI->setOperand(Idx, Escape);
1196 }
1197 }
1198 }
1199}
1200
1201 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1202 /// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): this extract omits listing lines 1203-1204 (presumably the
// signature and the declaration of the `SCEV2VPV` map) and 1207 (the range
// expression of the loop below).
1205
1206 for (VPRecipeBase &R :
1208 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1209 if (!ExpR)
1210 continue;
1211
// The first recipe seen for a given SCEV is recorded and kept; any later
// recipe for the same SCEV fails to insert and is replaced below.
1212 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1213 if (Inserted)
1214 continue;
1215
1216 ExpR->replaceAllUsesWith(V->second);
// Keep the plan's trip count pointing at the surviving recipe before the
// duplicate is erased.
1217 if (ExpR == Plan.getTripCount())
1218 Plan.resetTripCount(V->second);
1219
1220 ExpR->eraseFromParent();
1221 }
1222}
1223
// NOTE(review): this extract omits listing lines 1224 (presumably the function
// signature, declaring `V`) and 1226 (presumably the declaration of the `Seen`
// set used below).
// Worklist-driven deletion: starting from V, erase each defining recipe that
// isDeadRecipe() accepts and then revisit that recipe's operands.
1225 SmallVector<VPValue *> WorkList;
1227 WorkList.push_back(V);
1228
1229 while (!WorkList.empty()) {
1230 VPValue *Cur = WorkList.pop_back_val();
// Process every value at most once.
1231 if (!Seen.insert(Cur).second)
1232 continue;
// Values without a defining recipe have nothing to erase.
1233 VPRecipeBase *R = Cur->getDefiningRecipe();
1234 if (!R)
1235 continue;
1236 if (!isDeadRecipe(*R))
1237 continue;
// Queue the operands before erasing R so they can be checked afterwards.
1238 append_range(WorkList, R->operands());
1239 R->eraseFromParent();
1240 }
1241}
1242
1243 /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1244 /// Returns an optional pair, where the first element indicates whether it is
1245 /// an intrinsic ID.
// NOTE(review): this extract omits listing lines 1247 (the function name and
// parameter list) and 1250-1251 (the start of the first `.Case<...>` whose
// lambda appears on line 1252).
1246static std::optional<std::pair<bool, unsigned>>
1248 return TypeSwitch<const VPSingleDefRecipe *,
1249 std::optional<std::pair<bool, unsigned>>>(R)
// Recipes handled by this case expose their opcode via getOpcode().
1252 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
// Widened intrinsic calls yield {true, intrinsic ID}.
1253 .Case([](const VPWidenIntrinsicRecipe *I) {
1254 return std::make_pair(true, I->getVectorIntrinsicID());
1255 })
1256 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1257 [](auto *I) {
1258 // For recipes that do not directly map to LLVM IR instructions,
1259 // assign opcodes after the last VPInstruction opcode (which is also
1260 // after the last IR Instruction opcode), based on the VPRecipeID.
1261 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1262 I->getVPRecipeID());
1263 })
// Any other recipe kind has no opcode or intrinsic ID to report.
1264 .Default([](auto *) { return std::nullopt; });
1265}
1266
1267 /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1268 /// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1269 /// Operands are foldable live-ins.
// NOTE(review): this extract omits listing lines 1270 (the first line of the
// signature, declaring `R`), 1277 (presumably the declaration of `Ops`), 1308
// (the tail of the Not fold) and 1321-1322 (presumably the case label(s) for
// the i8 GEP fold on line 1323).
1271 ArrayRef<VPValue *> Operands,
1272 const DataLayout &DL) {
1273 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1274 if (!OpcodeOrIID)
1275 return nullptr;
1276
// Collect the underlying IR value of every operand; give up as soon as one
// operand is not a live-in or has no underlying value.
1278 for (VPValue *Op : Operands) {
// If Op matches a broadcast, Candidate is rebound to the broadcast's operand;
// otherwise it remains Op.
1279 VPValue *Candidate = Op;
1280 match(Op, m_Broadcast(m_VPValue(Candidate)));
1281 if (!match(Candidate, m_LiveIn()))
1282 return nullptr;
1283 Value *V = Candidate->getUnderlyingValue();
1284 if (!V)
1285 return nullptr;
1286 Ops.push_back(V);
1287 }
1288
1289 VPlan &Plan = *R.getParent()->getPlan();
// Dispatches to the matching InstSimplifyFolder entry point; yields nullptr
// for opcodes that are not handled.
1290 auto FoldToIRValue = [&]() -> Value * {
1291 InstSimplifyFolder Folder(DL);
// OpcodeOrIID->first is true for intrinsic IDs.
1292 if (OpcodeOrIID->first) {
1293 auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(&R);
1294 return Folder.FoldIntrinsic(OpcodeOrIID->second, Ops, R.getScalarType(),
1295 RFlags ? RFlags->getFastMathFlagsOrNone()
1296 : FastMathFlags());
1297 }
1298 unsigned Opcode = OpcodeOrIID->second;
1299 if (Instruction::isBinaryOp(Opcode))
1300 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1301 Ops[0], Ops[1]);
1302 if (Instruction::isCast(Opcode))
1303 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1304 R.getVPSingleValue()->getScalarType());
1305 switch (Opcode) {
// Not is folded as an Xor (second operand is on the omitted line 1308).
1306 case VPInstruction::Not:
1307 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1309 case Instruction::Select:
1310 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1311 case Instruction::ICmp:
1312 case Instruction::FCmp:
1313 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1314 Ops[1]);
1315 case Instruction::GetElementPtr: {
// The source element type is taken from the underlying IR GEP instruction.
1316 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1317 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1318 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1319 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1320 }
1323 return Folder.FoldGEP(IntegerType::getInt8Ty(Plan.getContext()), Ops[0],
1324 Ops[1],
1325 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1326 // An extract of a live-in is an extract of a broadcast, so return the
1327 // broadcasted element.
1328 case Instruction::ExtractElement:
1329 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1330 return Ops[0];
1331 }
1332 return nullptr;
1333 };
1334
// A successful fold is registered as a live-in of the plan and returned.
1335 if (Value *V = FoldToIRValue())
1336 return Plan.getOrAddLiveIn(V);
1337 return nullptr;
1338}
1339
1340 /// Try to simplify logical and bitwise recipes in \p Def.
// NOTE(review): this extract omits listing lines 1341 (the first line of the
// signature, which must declare `Def` and `Builder`), 1351-1352, 1402-1403,
// 1415, 1422 and 1457 (parts of match patterns). The affected `match(...)`
// expressions are incomplete as shown. Returns true when a simplification was
// applied and false otherwise. Folds that create new recipes are gated on
// CanCreateNewRecipe, except for the `x && (y && x)` fold below.
1342 bool CanCreateNewRecipe) {
1343 VPlan *Plan = Def->getParent()->getPlan();
1344
1345 // Simplify (X && Y) | (X && !Y) -> X.
1346 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1347 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1348 // recipes to be visited during simplification.
1349 VPValue *X, *Y, *Z;
1350 if (match(Def,
1353 Def->replaceAllUsesWith(X);
1354 Def->eraseFromParent();
1355 return true;
1356 }
1357
1358 // x | AllOnes -> AllOnes
1359 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
1360 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1361 return true;
1362 }
1363
1364 // x | 0 -> x
1365 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt()))) {
1366 Def->replaceAllUsesWith(X);
1367 return true;
1368 }
1369
1370 // x | !x -> AllOnes
1371 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X))))) {
1372 Def->replaceAllUsesWith(Plan->getAllOnesValue(Def->getScalarType()));
1373 return true;
1374 }
1375
1376 // x & 0 -> 0
1377 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt()))) {
1378 Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1379 return true;
1380 }
1381
1382 // x & AllOnes -> x
1383 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes()))) {
1384 Def->replaceAllUsesWith(X);
1385 return true;
1386 }
1387
1388 // x && false -> false
1389 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False()))) {
1390 Def->replaceAllUsesWith(Plan->getFalse());
1391 return true;
1392 }
1393
1394 // x && true -> x
1395 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True()))) {
1396 Def->replaceAllUsesWith(X);
1397 return true;
1398 }
1399
1400 // (x && y) | (x && z) -> x && (y | z)
1401 if (CanCreateNewRecipe &&
1404 // Simplify only if one of the operands has one use to avoid creating an
1405 // extra recipe.
1406 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1407 !Def->getOperand(1)->hasMoreThanOneUniqueUser())) {
1408 Def->replaceAllUsesWith(
1409 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1410 return true;
1411 }
1412
1413 // x && (x && y) -> x && y
1414 if (match(Def, m_LogicalAnd(m_VPValue(X),
1416 Def->replaceAllUsesWith(Def->getOperand(1));
1417 return true;
1418 }
1419
1420 // x && (y && x) -> x && y
// NOTE(review): this fold creates a new recipe without checking
// CanCreateNewRecipe; confirm against the real source that this is intended.
1421 if (match(Def, m_LogicalAnd(m_VPValue(X),
1423 Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1424 return true;
1425 }
1426
1427 // x && !x -> 0
1428 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X))))) {
1429 Def->replaceAllUsesWith(Plan->getFalse());
1430 return true;
1431 }
1432
// select c, x, x -> x
1433 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X)))) {
1434 Def->replaceAllUsesWith(X);
1435 return true;
1436 }
1437
1438 // select c, false, true -> not c
1439 VPValue *C;
1440 if (CanCreateNewRecipe &&
1441 match(Def, m_Select(m_VPValue(C), m_False(), m_True()))) {
1442 Def->replaceAllUsesWith(Builder.createNot(C));
1443 return true;
1444 }
1445
1446 // select !c, x, y -> select c, y, x
// Done in place by rewriting Def's operands; no new recipe is created.
1447 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1448 Def->setOperand(0, C);
1449 Def->setOperand(1, Y);
1450 Def->setOperand(2, X);
1451 return true;
1452 }
1453
1454 // select x, (i1 y | z), y -> y | (x && z)
1455 if (CanCreateNewRecipe &&
1456 match(Def, m_Select(m_VPValue(X),
1458 m_Deferred(Y))) &&
1459 Y->getScalarType()->isIntegerTy(1)) {
1460 Def->replaceAllUsesWith(
1461 Builder.createOr(Y, Builder.createLogicalAnd(X, Z)));
1462 return true;
1463 }
1464
1465 return false;
1466}
1467
1468 /// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): this extract omits listing lines 1469 (presumably the function
// signature, declaring `Def`), 1559, 1566, 1569, 1602, 1629, 1634, 1659, 1661,
// 1673, 1684, 1686, 1694, 1710, 1718, 1726, 1741-1742, 1752, 1766, 1779, 1784,
// 1809 and 1837. Several `match(...)` conditions and local declarations
// (e.g. `NW`, `NewOps`, `WidenIV`, `Inc`) are therefore incomplete as shown;
// consult the real source before editing them.
1470 VPlan *Plan = Def->getParent()->getPlan();
1471
1472 // Simplification of live-in IR values for SingleDef recipes using
1473 // InstSimplifyFolder.
1474 const DataLayout &DL = Plan->getDataLayout();
1475 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL))
1476 return Def->replaceAllUsesWith(V);
1477
1478 // Fold PredPHI LiveIn -> LiveIn.
1479 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1480 VPValue *Op = PredPHI->getOperand(0);
1481 if (isa<VPIRValue>(Op))
1482 PredPHI->replaceAllUsesWith(Op);
1483 }
1484
1485 // Drop the mask of a predicated store masked by the header mask (which is
1486 // guaranteed to be true at least for the first lane) and both the stored
1487 // value and the address are uniform across VF and UF.
1488 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
1489 RepR && RepR->isPredicated() && RepR->getOpcode() == Instruction::Store &&
1490 all_of(RepR->operandsWithoutMask(), vputils::isUniformAcrossVFsAndUFs) &&
1491 vputils::isHeaderMask(RepR->getMask(), *Plan)) {
1492 auto *Unmasked = new VPReplicateRecipe(
1493 RepR->getUnderlyingInstr(), RepR->operandsWithoutMask(),
1494 RepR->isSingleScalar(), /*Mask=*/nullptr, *RepR, *RepR,
1495 RepR->getDebugLoc());
1496 Unmasked->insertBefore(RepR);
1497 RepR->replaceAllUsesWith(Unmasked);
1498 RepR->eraseFromParent();
1499 return;
1500 }
1501
1502 VPBuilder Builder(Def);
1503
1504 // Avoid replacing VPInstructions with underlying values with new
1505 // VPInstructions, as we would fail to create widen/replicate recipes from the
1506 // new VPInstructions without an underlying value, and miss out on some
1507 // transformations that only apply to widened/replicated recipes later, by
1508 // doing so.
1509 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1510 // VPInstructions without underlying values, as those will get skipped during
1511 // cost computation.
1512 bool CanCreateNewRecipe =
1513 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1514
// trunc (zext/sext A): drop both casts when the types match, otherwise replace
// the pair by a single widened ext or trunc of A.
1515 VPValue *A;
1516 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1517 Type *TruncTy = Def->getScalarType();
1518 Type *ATy = A->getScalarType();
1519 if (TruncTy == ATy) {
1520 Def->replaceAllUsesWith(A);
1521 } else {
1522 // Don't replace a non-widened cast recipe with a widened cast.
1523 if (!isa<VPWidenCastRecipe>(Def))
1524 return;
1525 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1526
// Preserve the signedness of the original extension.
1527 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1528 ? Instruction::SExt
1529 : Instruction::ZExt;
1530 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1531 TruncTy);
1532 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1533 // UnderlyingExt has distinct return type, used to retain legacy cost.
1534 Ext->setUnderlyingValue(UnderlyingExt);
1535 }
1536 Def->replaceAllUsesWith(Ext);
1537 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1538 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1539 Def->replaceAllUsesWith(Trunc);
1540 }
1541 }
1542 }
1543
1544 if (simplifyLogicalRecipe(Def, Builder, CanCreateNewRecipe))
1545 return;
1546
1547 VPValue *X, *Y, *C;
// x + 0 -> x
1548 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1549 return Def->replaceAllUsesWith(A);
1550
// x * 1 -> x
1551 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1552 return Def->replaceAllUsesWith(A);
1553
// x * 0 -> 0
1554 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1555 return Def->replaceAllUsesWith(Plan->getZero(Def->getScalarType()));
1556
// x * -1 -> 0 - x
1557 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1558 // Preserve nsw from the Mul on the new Sub.
1560 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1561 return Def->replaceAllUsesWith(Builder.createSub(
1562 Plan->getZero(A->getScalarType()), A, Def->getDebugLoc(), "", NW));
1563 }
1564
1565 if (CanCreateNewRecipe &&
1567 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1568 // new Sub.
1570 false,
1571 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1572 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1573 ->hasNoSignedWrap()};
1574 return Def->replaceAllUsesWith(
1575 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1576 }
1577
// x * 2^k -> x << k
1578 const APInt *APC;
1579 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1580 APC->isPowerOf2())
1581 return Def->replaceAllUsesWith(Builder.createNaryOp(
1582 Instruction::Shl,
1583 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1584 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1585
// x udiv 2^k -> x lshr k
1586 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1587 APC->isPowerOf2())
1588 return Def->replaceAllUsesWith(Builder.createNaryOp(
1589 Instruction::LShr,
1590 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1591 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1592
1593 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not x) -> x. The inner match rebinds A to x on success.
1594 if (match(A, m_Not(m_VPValue(A))))
1595 return Def->replaceAllUsesWith(A);
1596
1597 // Try to fold Not into compares by adjusting the predicate in-place.
1598 CmpPredicate Pred;
1599 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1600 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1601 if (all_of(Cmp->users(),
1603 m_Not(m_Specific(Cmp)),
1604 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1605 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
// Iterate over a copy of the user list, since the loop body rewrites users.
1606 for (VPUser *U : to_vector(Cmp->users())) {
1607 auto *R = cast<VPSingleDefRecipe>(U);
1608 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1609 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1610 R->setOperand(1, Y);
1611 R->setOperand(2, X);
1612 } else {
1613 // not (cmp pred) -> cmp inv_pred
1614 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1615 R->replaceAllUsesWith(Cmp);
1616 }
1617 }
1618 // If Cmp doesn't have a debug location, use the one from the negation,
1619 // to preserve the location.
1620 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1621 Cmp->setDebugLoc(Def->getDebugLoc());
1622 }
1623 }
1624 }
1625
1626 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1627 // any-of (fcmp uno %A, %B), ...
1628 if (match(Def, m_AnyOf())) {
// UnpairedCmp holds a matching compare that is still waiting for a partner.
1630 VPRecipeBase *UnpairedCmp = nullptr;
1631 for (VPValue *Op : Def->operands()) {
1632 VPValue *X;
1633 if (Op->getNumUsers() > 1 ||
1635 m_Deferred(X)))) {
1636 NewOps.push_back(Op);
1637 } else if (!UnpairedCmp) {
1638 UnpairedCmp = Op->getDefiningRecipe();
1639 } else {
1640 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1641 UnpairedCmp->getOperand(0), X));
1642 UnpairedCmp = nullptr;
1643 }
1644 }
1645
// A leftover compare without a partner is kept as an operand unchanged.
1646 if (UnpairedCmp)
1647 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1648
// Only rebuild the any-of when at least one pair was merged.
1649 if (NewOps.size() < Def->getNumOperands()) {
1650 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1651 return Def->replaceAllUsesWith(NewAnyOf);
1652 }
1653 }
1654
1655 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1656 // This is useful for fmax/fmin without fast-math flags, where we need to
1657 // check if any operand is NaN.
1658 if (CanCreateNewRecipe &&
1660 m_Deferred(X)),
1662 m_Deferred(Y))))) {
1663 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1664 return Def->replaceAllUsesWith(NewCmp);
1665 }
1666
1667 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1668 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1669 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1670 Def->getOperand(1)->getScalarType() == Def->getScalarType())
1671 return Def->replaceAllUsesWith(Def->getOperand(1));
1672
1674 m_One()))) {
// X may have a different scalar type than Def; cast it to Def's type first.
1675 Type *WideStepTy = Def->getScalarType();
1676 if (X->getScalarType() != WideStepTy)
1677 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1678 Def->replaceAllUsesWith(X);
1679 return;
1680 }
1681
1682 // For i1 vp.merges produced by AnyOf reductions:
1683 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1685 m_VPValue(X), m_VPValue())) &&
1687 Def->getScalarType()->isIntegerTy(1)) {
1688 Def->setOperand(1, Def->getOperand(0));
1689 Def->setOperand(0, Y);
1690 return;
1691 }
1692
1693 // Simplify MaskedCond with no block mask to its single operand.
1695 !cast<VPInstruction>(Def)->isMasked())
1696 return Def->replaceAllUsesWith(Def->getOperand(0));
1697
1698 // Look through ExtractLastLane.
1699 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// The last lane of a BuildVector is its last operand.
1700 if (match(A, m_BuildVector())) {
1701 auto *BuildVector = cast<VPInstruction>(A);
1702 Def->replaceAllUsesWith(
1703 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1704 return;
1705 }
1706
// The last lane of a broadcast is the broadcast value itself.
1707 if (match(A, m_Broadcast(m_VPValue(X))))
1708 return Def->replaceAllUsesWith(X);
1709
1711 return Def->replaceAllUsesWith(A);
1712
1713 if (Plan->hasScalarVFOnly())
1714 return Def->replaceAllUsesWith(A);
1715 }
1716
1717 // Look through ExtractPenultimateElement (BuildVector ....).
1719 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1720 Def->replaceAllUsesWith(
1721 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1722 return;
1723 }
1724
1725 uint64_t Idx;
1727 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1728 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1729 return;
1730 }
1731
// A BuildVector whose operands are all equal is a broadcast of that operand.
1732 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1733 Def->replaceAllUsesWith(
1734 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1735 return;
1736 }
1737
1738 // Look through broadcast of single-scalar when used as select conditions; in
1739 // that case the scalar condition can be used directly.
1740 if (match(Def,
1743 "broadcast operand must be single-scalar");
1744 Def->setOperand(0, C);
1745 return;
1746 }
1747
// Users that only need scalars of a broadcast can use the broadcast's operand.
1748 if (match(Def, m_Broadcast(m_VPValue(X))))
1749 return Def->replaceUsesWithIf(
1750 X, [Def](const VPUser &U, unsigned) { return U.usesScalars(Def); });
1751
1753 if (Def->getNumOperands() == 1) {
1754 Def->replaceAllUsesWith(Def->getOperand(0));
1755 return;
1756 }
1757 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1758 if (all_equal(Phi->incoming_values()))
1759 Phi->replaceAllUsesWith(Phi->getOperand(0));
1760 }
1761 return;
1762 }
1763
1764 VPIRValue *IRV;
1765 if (Def->getNumOperands() == 1 &&
1767 return Def->replaceAllUsesWith(IRV);
1768
1769 // Some simplifications can only be applied after unrolling. Perform them
1770 // below.
1771 if (!Plan->isUnrolled())
1772 return;
1773
1774 // After unrolling, extract-lane may be used to extract values from multiple
1775 // scalar sources. Only simplify when extracting from a single scalar source.
1776 VPValue *LaneToExtract;
1777 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1778 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1780 return Def->replaceAllUsesWith(A);
1781
1782 // Replace extract-lane(0, canonical-WIDEN-INDUCTION) with the region's
1783 // scalar canonical IV.
1785 if (match(LaneToExtract, m_ZeroInt()) &&
1786 match(A, m_CanonicalWidenIV(WidenIV)))
1787 return Def->replaceAllUsesWith(WidenIV->getRegion()->getCanonicalIV());
1788
1789 // Simplify extract-lane with single source to extract-element.
1790 Def->replaceAllUsesWith(Builder.createNaryOp(
1791 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1792 return;
1793 }
1794
1795 // Look for cycles where Def is of the form:
1796 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1797 // IVInc = X + Step ; used by X and Def
1798 // Def = IVInc + Y
1799 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1800 // and if Inc exists, replace it with X.
1801 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1802 isa<VPIRValue>(Y) &&
1803 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1804 auto *Phi = cast<VPPhi>(X);
1805 auto *IVInc = Def->getOperand(0);
1806 if (IVInc->getNumUsers() == 2) {
1807 // If Phi has a second user (besides IVInc's defining recipe), it must
1808 // be Inc = Phi + Y for the fold to apply.
1810 findUserOf(Phi, m_Add(m_Specific(Phi), m_Specific(Y))));
1811 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1812 Def->replaceAllUsesWith(IVInc);
1813 if (Inc)
1814 Inc->replaceAllUsesWith(Phi);
1815 Phi->setOperand(0, Y);
1816 return;
1817 }
1818 }
1819 }
1820
1821 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1822 // just the pointer operand.
1823 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1824 if (!VPR->getVFxPart() || match(VPR->getVFxPart(), m_ZeroInt()))
1825 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1826
1827 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1828 // the start index is zero and only the first lane is demanded.
1829 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1830 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1831 Steps->replaceAllUsesWith(Steps->getOperand(0));
1832 return;
1833 }
1834 }
1835 // Simplify redundant ReductionStartVector recipes after unrolling.
1836 VPValue *StartV;
1838 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// Only in-loop reduction phis are switched over to the scalar start value.
1839 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1840 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1841 return PhiR && PhiR->isInLoop();
1842 });
1843 return;
1844 }
1845
// With a concrete UF of 1, extracting the last part yields the operand itself.
1846 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1847 return Def->replaceAllUsesWith(A);
1848}
1849
1859
1861 VPValue *X;
1864 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1865 if (match(&R, m_Reverse(m_Reverse(m_VPValue(X)))))
1866 R.getVPSingleValue()->replaceAllUsesWith(X);
1867}
1868
1869 /// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1870 /// header mask to be simplified further when tail folding, e.g. in
1871 /// optimizeEVLMasks.
/// Returns without changes when vputils::findHeaderMask finds no header mask.
1872 static void reassociateHeaderMask(VPlan &Plan) {
1873 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1874 if (!HeaderMask)
1875 return;
1876
// Consider the users of the header mask that have the form (HeaderMask && x).
// NOTE(review): the statement guarded by the `if` below (original line 1880)
// is absent from this listing; presumably it adds U to Worklist -- confirm
// against the upstream file.
1877 SmallVector<VPUser *> Worklist;
1878 for (VPUser *U : HeaderMask->users())
1879 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1881
1882 while (!Worklist.empty()) {
1883 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1884 VPValue *X, *Y;
// Only single-def recipes of the form ((HeaderMask && X) && Y) are rewritten;
// everything else popped from the worklist is ignored.
1885 if (!R || !match(R, m_LogicalAnd(
1886 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1887 m_VPValue(Y))))
1888 continue;
// Queue R's users before replacing R: once they use the reassociated value
// HeaderMask && (X && Y), a logical-and built on top of it can match the
// pattern above in turn.
1889 append_range(Worklist, R->users());
1890 VPBuilder Builder(R);
1891 R->replaceAllUsesWith(
1892 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1893 }
1894 }
1895
/// Map a masked division/remainder intrinsic ID to the plain binary opcode of
/// the same operation: masked_udiv -> UDiv, masked_sdiv -> SDiv,
/// masked_urem -> URem, masked_srem -> SRem. Any other ID yields an empty
/// optional.
/// NOTE(review): the line with the function name and parameter list (original
/// line 1897) is absent from this listing; a call further down in this file
/// spells the name getUnmaskedDivRemOpcode and passes a vector intrinsic ID.
1896 static std::optional<Instruction::BinaryOps>
1898 switch (ID) {
1899 case Intrinsic::masked_udiv:
1900 return Instruction::UDiv;
1901 case Intrinsic::masked_sdiv:
1902 return Instruction::SDiv;
1903 case Intrinsic::masked_urem:
1904 return Instruction::URem;
1905 case Intrinsic::masked_srem:
1906 return Instruction::SRem;
1907 default:
1908 return {};
1909 }
1910 }
1911
/// Replace wide or replicating recipes by single-scalar equivalents where the
/// checks below allow it. Does nothing for plans that only have a scalar VF.
/// Recipes are visited in reverse order within each block.
/// NOTE(review): the signature (original line 1912) and several other lines
/// (1916, 1919-1920, 1973-1975) are absent from this listing; a comment in
/// licm() refers to a transform named narrowToSingleScalarRecipes, which is
/// presumably this one -- confirm against the upstream file.
1913 if (Plan.hasScalarVFOnly())
1914 return;
1915
1917 vp_depth_first_deep(Plan.getEntry()))) {
1918 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1921 continue;
// Replicate recipes that are already single-scalar, or that are predicated,
// are left alone.
1922 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1923 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1924 continue;
1925
1926 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
// Case 1: a replicated store whose operand 1 is single-scalar. It is replaced
// by a single-scalar, unmasked replicate whose operand 0 is the last lane of
// the original operand 0 (taken from the last part first when operand 1 is
// uniform across VFs and UFs).
1927 if (RepR && RepR->getOpcode() == Instruction::Store &&
1928 vputils::isSingleScalar(RepR->getOperand(1))) {
1929 auto *Clone = new VPReplicateRecipe(
1930 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1931 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1932 *RepR /*Metadata*/, RepR->getDebugLoc());
1933 Clone->insertBefore(RepOrWidenR);
1934 VPBuilder Builder(Clone);
1935 VPValue *ExtractOp = Clone->getOperand(0);
1936 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1937 ExtractOp =
1938 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1939 ExtractOp =
1940 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1941 Clone->setOperand(0, ExtractOp);
1942 RepR->eraseFromParent();
1943 continue;
1944 }
1945
// Case 2: widened intrinsics. Only the masked div/rem intrinsics known to
// getUnmaskedDivRemOpcode are handled, and only when just their first lane is
// used; every other widened intrinsic is skipped.
1946 // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor.
1947 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(RepOrWidenR)) {
1948 if (!vputils::onlyFirstLaneUsed(IntrR))
1949 continue;
1950 auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID());
1951 if (!Opc)
1952 continue;
1953 VPBuilder Builder(IntrR);
// The divisor becomes select(operand 2, operand 1, 1), i.e. the constant 1 is
// used wherever operand 2 is false.
1954 VPValue *SafeDivisor = Builder.createSelect(
1955 IntrR->getOperand(2), IntrR->getOperand(1),
1956 Plan.getConstantInt(IntrR->getScalarType(), 1));
1957 VPValue *Clone = Builder.createNaryOp(
1958 *Opc, {IntrR->getOperand(0), SafeDivisor},
1959 VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc());
1960 IntrR->replaceAllUsesWith(Clone);
1961 IntrR->eraseFromParent();
1962 continue;
1963 }
1964
1965 // Skip recipes that aren't single scalars.
1966 if (!vputils::isSingleScalar(RepOrWidenR))
1967 continue;
1968
1969 // Predicate to check if a user of Op introduces extra broadcasts.
// A VPInstruction user passing the opcode check (whose first lines are missing
// from this listing) never counts; any other user counts when it does not use
// scalars of Op.
1970 auto IntroducesBCastOf = [](const VPValue *Op) {
1971 return [Op](const VPUser *U) {
1972 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1976 VPI->getOpcode()))
1977 return false;
1978 }
1979 return !U->usesScalars(Op);
1980 };
1981 };
1982
// Leave the recipe as is when some user would need a broadcast of its result
// and no operand qualifies below. An operand qualifies when none of its other
// users already introduces a broadcast of it and it is either a non-constant
// live-in or a single-scalar replicate.
1983 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1984 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1985 if (any_of(
1986 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1987 IntroducesBCastOf(Op)))
1988 return false;
1989 // Non-constant live-ins require broadcasts, while constants do not
1990 // need explicit broadcasts.
1991 auto *IRV = dyn_cast<VPIRValue>(Op);
1992 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1993 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1994 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1995 }))
1996 continue;
1997
// Case 3: create a single-scalar recipe with the same opcode or intrinsic ID
// and operands, redirect all users to it, and erase the original if it is
// dead afterwards.
1998 auto *Clone = VPBuilder::createSingleScalarOp(
1999 getOpcodeOrIntrinsicID(RepOrWidenR)->second, RepOrWidenR->operands(),
2000 /*Mask=*/nullptr, *RepOrWidenR, {}, DebugLoc::getUnknown(),
2001 RepOrWidenR->getUnderlyingInstr());
2002 Clone->insertBefore(RepOrWidenR);
2003 RepOrWidenR->replaceAllUsesWith(Clone);
2004 if (isDeadRecipe(*RepOrWidenR))
2005 RepOrWidenR->eraseFromParent();
2006 }
2007 }
2008 }
2009
2010 /// Try to see if all of \p Blend's masks share a common value logically and'ed
2011 /// and remove it from the masks.
/// Normalized blends are left untouched.
/// NOTE(review): the signature line (original line 2012) is absent from this
/// listing; a call in simplifyBlends spells the name removeCommonBlendMask and
/// passes the blend recipe.
2013 if (Blend->isNormalized())
2014 return;
// The candidate common value is the first operand of mask 0's logical-and.
2015 VPValue *CommonEdgeMask;
2016 if (!match(Blend->getMask(0),
2017 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
2018 return;
// Give up unless every mask is logical-and(CommonEdgeMask, x).
2019 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2020 if (!match(Blend->getMask(I),
2021 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
2022 return;
// Replace each mask by operand 1 of the recipe defining it, i.e. the part of
// the logical-and that is not the common value.
2023 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
2024 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
2025 }
2026
2027 /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
2028 /// to make sure the masks are simplified.
// NOTE(review): the block-iteration lines (original lines 2030-2031) are
// absent from this listing; the visible loop walks the recipes of a block
// named VPBB.
2029 static void simplifyBlends(VPlan &Plan) {
2032 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2033 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
2034 if (!Blend)
2035 continue;
2036
2037 removeCommonBlendMask(Blend);
2038
2039 // Try to remove redundant blend recipes.
// Collect the distinct incoming values whose mask is not known to be false.
// Incoming value 0 is always collected for a normalized blend.
2040 SmallPtrSet<VPValue *, 4> UniqueValues;
2041 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
2042 UniqueValues.insert(Blend->getIncomingValue(0));
2043 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
2044 if (!match(Blend->getMask(I), m_False()))
2045 UniqueValues.insert(Blend->getIncomingValue(I));
2046
// A blend with a single distinct incoming value is replaced by that value.
2047 if (UniqueValues.size() == 1) {
2048 Blend->replaceAllUsesWith(*UniqueValues.begin());
2049 Blend->eraseFromParent();
2050 continue;
2051 }
2052
2053 if (Blend->isNormalized())
2054 continue;
2055
2056 // Normalize the blend so its first incoming value is used as the initial
2057 // value with the others blended into it.
2058
// Pick as the initial value the first incoming value whose mask has a single
// use and is not known to be false; default to incoming value 0.
2059 unsigned StartIndex = 0;
2060 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2061 // If a value's mask is used only by the blend then it can be dead-coded.
2062 // TODO: Find the most expensive mask that can be deadcoded, or a mask
2063 // that's used by multiple blends where it can be removed from them all.
2064 VPValue *Mask = Blend->getMask(I);
2065 if (Mask->hasOneUse() && !match(Mask, m_False())) {
2066 StartIndex = I;
2067 break;
2068 }
2069 }
2070
// New operand list: the initial value without a mask, followed by
// (value, mask) pairs for every other incoming value.
2071 SmallVector<VPValue *, 4> OperandsWithMask;
2072 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
2073
2074 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
2075 if (I == StartIndex)
2076 continue;
2077 OperandsWithMask.push_back(Blend->getIncomingValue(I));
2078 OperandsWithMask.push_back(Blend->getMask(I));
2079 }
2080
2081 auto *NewBlend =
2082 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
2083 OperandsWithMask, *Blend, Blend->getDebugLoc());
2084 NewBlend->insertBefore(&R);
2085
// The mask of the chosen initial value is no longer an operand of the new
// blend. It is captured before the old blend is erased.
// NOTE(review): the statement that consumes DeadMask (original line 2089) is
// absent from this listing; presumably it deletes the now-dead mask -- confirm.
2086 VPValue *DeadMask = Blend->getMask(StartIndex);
2087 Blend->replaceAllUsesWith(NewBlend);
2088 Blend->eraseFromParent();
2090
2091 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
// Applies only to blends with exactly three operands; the Not recipe is erased
// when it has no users left afterwards.
2092 VPValue *NewMask;
2093 if (NewBlend->getNumOperands() == 3 &&
2094 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
2095 VPValue *Inc0 = NewBlend->getOperand(0);
2096 VPValue *Inc1 = NewBlend->getOperand(1);
2097 VPValue *OldMask = NewBlend->getOperand(2);
2098 NewBlend->setOperand(0, Inc1);
2099 NewBlend->setOperand(1, Inc0);
2100 NewBlend->setOperand(2, NewMask);
2101 if (OldMask->user_empty())
2102 cast<VPInstruction>(OldMask)->eraseFromParent();
2103 }
2104 }
2105 }
2106 }
2107
2108 /// Optimize the width of vector induction variables in \p Plan based on a known
2109 /// constant Trip Count, \p BestVF and \p BestUF.
/// Returns true if at least one induction was replaced by a narrower one.
/// Returns false without changes when the plan has no vector loop region, when
/// \p BestVF is not fixed, or when the trip count is not a constant.
/// NOTE(review): the first signature line (original line 2110) is absent from
/// this listing; a call further down in this file spells the name
/// optimizeVectorInductionWidthForTCAndVFUF.
2111 ElementCount BestVF,
2112 unsigned BestUF) {
2113 // Only proceed if we have not completely removed the vector region.
2114 if (!Plan.getVectorLoopRegion())
2115 return false;
2116
2117 const APInt *TC;
2118 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2119 return false;
2120
2121 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2122 // and UF. Returns at least 8.
// NOTE(review): the computation of AlignedTC (original lines 2125-2126) is
// absent from this listing; presumably TC is rounded up to a multiple of
// Align -- confirm.
2123 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2124 APInt AlignedTC =
2127 APInt MaxVal = AlignedTC - 1;
2128 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2129 };
2130 unsigned NewBitWidth =
2131 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2132
2133 LLVMContext &Ctx = Plan.getContext();
2134 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2135
2136 bool MadeChange = false;
2137
2138 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2139 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2140 // Currently only handle canonical IVs as it is trivial to replace the start
2141 // and stop values, and we currently only perform the optimization when the
2142 // IV has a single use.
2144 if (!match(&Phi, m_CanonicalWidenIV(WideIV)))
2145 continue;
// Skip IVs with more than one unique user and IVs that already have the
// computed type.
2146 if (WideIV->hasMoreThanOneUniqueUser() ||
2147 NewIVTy == WideIV->getScalarType())
2148 continue;
2149
2150 // Currently only handle cases where the single user is a header-mask
2151 // comparison with the backedge-taken-count.
2152 VPUser *SingleUser = WideIV->getSingleUser();
2153 if (!SingleUser ||
2154 !match(SingleUser,
2155 m_ICmp(m_Specific(WideIV),
2157 continue;
2158
2159 // Update IV operands and comparison bound to use new narrower type.
// The new IV starts at zero with step one in NewIVTy and is inserted before
// the IV it stands in for.
2160 assert(!WideIV->getTruncInst() &&
2161 "canonical IV is not expected to have a truncation");
2162 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
2163 WideIV->getPHINode(), Plan.getZero(NewIVTy),
2164 Plan.getConstantInt(NewIVTy, 1), WideIV->getVFValue(),
2165 WideIV->getInductionDescriptor(), *WideIV, WideIV->getDebugLoc());
2166 NewWideIV->insertBefore(WideIV);
2167
// The backedge-taken count is truncated to NewIVTy in the vector preheader,
// and the compare is re-created on the narrow operands with the original
// predicate.
2168 auto *NewBTC = new VPWidenCastRecipe(
2169 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2170 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2171 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2172 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2173 Cmp->replaceAllUsesWith(
2174 VPBuilder(Cmp).createICmp(Cmp->getPredicate(), NewWideIV, NewBTC));
2175
2176 MadeChange = true;
2177 }
2178
2179 return MadeChange;
2180 }
2181
2182 /// Return true if \p Cond is known to be true for given \p BestVF and \p
2183 /// BestUF.
/// NOTE(review): several lines of this function are absent from this listing
/// (original lines 2184, 2186-2187, 2194-2195, 2205), including the first
/// signature line and the conditions guarding the two early returns; the
/// recursive call below spells the name isConditionTrueViaVFAndUF.
2185 ElementCount BestVF, unsigned BestUF,
// Recursive case: the condition holds if it holds for any operand of the
// recipe defining Cond. The pattern selecting this case is on a missing line.
2188 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2189 &PSE](VPValue *C) {
2190 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2191 });
2192
// Otherwise the condition must involve (canonical IV + VFxUF), in either
// operand order, and the vector trip count; anything else is not handled.
2193 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2196 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2197 m_Specific(&Plan.getVectorTripCount()))))
2198 return false;
2199
2200 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2201 // count is not conveniently available as SCEV so far, so we compare directly
2202 // against the original trip count. This is stricter than necessary, as we
2203 // will only return true if the trip count == vector trip count.
// If the first SCEV lookup (on a missing line) is not computable, fall back to
// the SCEV of the plan's trip count.
2204 const SCEV *VectorTripCount =
2206 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2207 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2208 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2209 "Trip count SCEV must be computable");
2210 ScalarEvolution &SE = *PSE.getSE();
// True only if the trip count is known to equal BestVF * BestUF elements.
2211 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2212 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2213 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2214 }
2215
2216 /// Try to replace multiple active lane masks used for control flow with
2217 /// a single, wide active lane mask instruction followed by multiple
2218 /// extract subvector intrinsics. This applies to the active lane mask
2219 /// instructions both in the loop and in the preheader.
2220 /// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2221 /// new extracts from the first active lane mask, which has its last
2222 /// operand (multiplier) set to UF.
/// Returns true if the plan was changed. Returns false when
/// EnableWideActiveLaneMask is off, \p VF is not a vector VF, \p UF is 1, or
/// the exiting block's terminator does not match the expected pattern.
/// NOTE(review): several lines are absent from this listing (original lines
/// 2223, 2233, 2244, 2255, 2257, 2262, 2267, 2274), including the first
/// signature line; a call further down in this file spells the name
/// tryToReplaceALMWithWideALM.
2224 unsigned UF) {
2225 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2226 return false;
2227
2228 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2229 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2230 auto *Term = &ExitingVPBB->back();
2231
2232 using namespace llvm::VPlanPatternMatch;
2234 m_VPValue(), m_VPValue(), m_VPValue())))))
2235 return false;
2236
2237 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2238 LLVMContext &Ctx = Plan.getContext();
2239
// Helper: for each unroll part, create a vector_extract intrinsic recipe on
// ALM at element offset VF.getKnownMinValue() * Part, store it in
// Extracts[Part] and insert it after ALM.
2240 auto ExtractFromALM = [&](VPInstruction *ALM,
2241 SmallVectorImpl<VPValue *> &Extracts) {
2242 DebugLoc DL = ALM->getDebugLoc();
2243 for (unsigned Part = 0; Part < UF; ++Part) {
2245 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2246 auto *Ext =
2247 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2248 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2249 Extracts[Part] = Ext;
2250 Ext->insertAfter(ALM);
2251 }
2252 };
2253
2254 // Create a list of each active lane mask phi, ordered by unroll part.
2256 for (VPRecipeBase &R : Header->phis()) {
2258 if (!Phi)
2259 continue;
2260 VPValue *Index = nullptr;
2261 match(Phi->getBackedgeValue(),
2263 assert(Index && "Expected index from ActiveLaneMask instruction");
2264
// The unroll part is read from the constant multiplier of the matched index
// expression; a phi whose index does not match that form is stored as part 0.
2265 uint64_t Part;
2266 if (match(Index,
2268 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2269 Phis[Part] = Phi;
2270 else {
2271 // Anything other than a CanonicalIVIncrementForPart is part 0
2272 assert(!match(
2273 Index,
2275 Phis[0] = Phi;
2276 }
2277 }
2278
2279 assert(all_of(Phis, not_equal_to(nullptr)) &&
2280 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2281
// The start and backedge values of the part-0 phi are the masks that get
// widened; the masks feeding the other parts' phis are no longer used by
// those phis after the update at the end of this function.
2282 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2283 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2284
2285 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2286 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2287 "Expected incoming values of Phi to be ActiveLaneMasks");
2288
2289 // When using wide lane masks, the return type of the get.active.lane.mask
2290 // intrinsic is VF x UF (last operand).
2291 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2292 EntryALM->setOperand(2, ALMMultiplier);
2293 LoopALM->setOperand(2, ALMMultiplier);
2294
2295 // Create UF x extract vectors and insert into preheader.
2296 SmallVector<VPValue *> EntryExtracts(UF);
2297 ExtractFromALM(EntryALM, EntryExtracts);
2298
2299 // Create UF x extract vectors and insert before the loop compare & branch,
2300 // updating the compare to use the first extract.
2301 SmallVector<VPValue *> LoopExtracts(UF);
2302 ExtractFromALM(LoopALM, LoopExtracts);
2303 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2304 Not->setOperand(0, LoopExtracts[0]);
2305
2306 // Update the incoming values of active lane mask phis.
2307 for (unsigned Part = 0; Part < UF; ++Part) {
2308 Phis[Part]->setStartValue(EntryExtracts[Part]);
2309 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2310 }
2311
2312 return true;
2313 }
2314
2315 /// Try to simplify the branch condition of \p Plan. This may restrict the
2316 /// resulting plan to \p BestVF and \p BestUF.
/// Returns true if the terminator of the region's exiting block was changed so
/// that the loop exits in its first iteration, and false otherwise.
/// NOTE(review): several lines are absent from this listing (original lines
/// 2317, 2319, 2330, 2335, 2338, 2347), including the first signature line; a
/// call further down in this file spells the name
/// simplifyBranchConditionForVFAndUF.
2318 unsigned BestUF,
2320 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2321 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2322 auto *Term = &ExitingVPBB->back();
2323 VPValue *Cond;
2324 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2325 // Check if the branch condition compares the canonical IV increment (for main
2326 // loop), or the canonical IV increment plus an offset (for epilog loop).
2327 if (match(Term, m_BranchOnCount(
2328 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2329 m_VPValue())) ||
2331 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2332 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2333 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2334 const SCEV *VectorTripCount =
2336 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2337 VectorTripCount =
2339 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2340 "Trip count SCEV must be computable");
2341 ScalarEvolution &SE = *PSE.getSE();
2342 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2343 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
// Give up unless the trip count is known to be at most BestVF * BestUF.
2344 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2345 return false;
2346 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2348 // For BranchOnCond, check if we can prove the condition to be true using VF
2349 // and UF.
2350 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2351 return false;
2352 } else {
// Any other terminator is left unchanged.
2353 return false;
2354 }
2355
2356 // The vector loop region only executes once. Convert terminator of the
2357 // exiting block to exit in the first iteration.
// A BranchOnTwoConds terminator is kept and only its operand 1 is set to true.
2358 if (match(Term, m_BranchOnTwoConds())) {
2359 Term->setOperand(1, Plan.getTrue());
2360 return true;
2361 }
2362
// Every other handled terminator is replaced by a BranchOnCond on the constant
// true, carrying the old terminator's debug location.
2363 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2364 {}, Term->getDebugLoc());
2365 ExitingVPBB->appendRecipe(BOC);
2366 Term->eraseFromParent();
2367
2368 return true;
2369 }
2370
2371 /// From the definition of llvm.experimental.get.vector.length,
2372 /// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
/// Replaces the uses of the first qualifying EVL recipe found and returns true;
/// returns false if no recipe qualifies. A recipe qualifies when the SCEV of
/// its AVL operand is computable and known to be unsigned-less-or-equal to VF.
/// NOTE(review): several lines are absent from this listing (original lines
/// 2373-2375 and 2390), including the signature and the call that creates the
/// value named Trunc below.
2376 vp_depth_first_deep(Plan.getEntry()))) {
2377 for (VPRecipeBase &R : *VPBB) {
2378 VPValue *AVL;
2379 if (!match(&R, m_EVL(m_VPValue(AVL))))
2380 continue;
2381
2382 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2383 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2384 continue;
2385 ScalarEvolution &SE = *PSE.getSE();
2386 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2387 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2388 continue;
2389
// Trunc is built from AVL, the i32 type and AVL's SCEV type; presumably it is
// AVL converted to i32 -- confirm against the missing line.
2391 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2392 R.getDebugLoc());
// If a new recipe was created (Trunc differs from AVL), try to fold it with
// tryToFoldLiveIns and use the folded value when that succeeds.
2393 if (Trunc != AVL) {
2394 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2395 const DataLayout &DL = Plan.getDataLayout();
2396 if (VPValue *Folded = tryToFoldLiveIns(*TruncR, TruncR->operands(), DL))
2397 Trunc = Folded;
2398 }
2399 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2400 return true;
2401 }
2402 }
2403 return false;
2404 }
2405
/// Run the simplifications that depend on a concrete \p BestVF and \p BestUF:
/// wide active-lane-mask replacement, branch-condition simplification and
/// vector induction width narrowing. If any of them changed the plan, the plan
/// is restricted to \p BestVF. Both values must be available in the plan.
/// NOTE(review): parts of the signature (original lines 2406 and 2408) are
/// absent from this listing.
2407 unsigned BestUF,
2409 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2410 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2411
// All three transforms always run; `|=` does not short-circuit.
2412 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2413 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2414 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2415
2416 if (MadeChange) {
2417 Plan.setVF(BestVF);
2418 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2419 }
2420 }
2421
/// For each reduction phi whose recurrence kind passes the filter below, drop
/// the poison-generating flags of every recipe with IR flags among the users
/// returned by collectUsersRecursively for that phi.
/// NOTE(review): the signature and parts of the loop header and of the
/// recurrence-kind condition (original lines 2422, 2424, 2430) are absent from
/// this listing; the visible part of the condition accepts at least Add, Mul
/// and Sub.
2423 for (VPRecipeBase &R :
2425 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2426 if (!PhiR)
2427 continue;
2428 RecurKind RK = PhiR->getRecurrenceKind();
2429 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2431 continue;
2432
2433 for (VPUser *U : collectUsersRecursively(PhiR))
2434 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2435 RecWithFlags->dropPoisonGeneratingFlags();
2436 }
2437 }
2438 }
2439
2440 namespace {
/// DenseMap traits used to key a map on the contents of a VPSingleDefRecipe
/// rather than on its address: getHashValue and isEqual compare recipe ID,
/// opcode or intrinsic ID, GEP source element type, scalar type, operands, and
/// the extra data noted in each function.
/// NOTE(review): a few lines are absent from this listing (original lines
/// 2447, 2483, 2495, 2497, 2500).
2441 struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2442 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2443 /// return that source element type.
/// Returns nullptr for every recipe kind not listed below, and for replicate
/// recipes whose underlying value is not a GetElementPtrInst.
2444 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2445 // All VPInstructions that lower to GEPs must have the i8 source element
2446 // type (as they are PtrAdds), so we omit it.
2448 .Case([](const VPReplicateRecipe *I) -> Type * {
2449 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2450 return GEP->getSourceElementType();
2451 return nullptr;
2452 })
2453 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2454 [](auto *I) { return I->getSourceElementType(); })
2455 .Default([](auto *) { return nullptr; });
2456 }
2457
2458 /// Returns true if recipe \p Def can be safely handled for CSE.
2459 static bool canHandle(const VPSingleDefRecipe *Def) {
2460 // We can extend the list of handled recipes in the future,
2461 // provided we account for the data embedded in them while checking for
2462 // equality or hashing.
2463 auto C = getOpcodeOrIntrinsicID(Def);
2464
2465 // The issue with (Insert|Extract)Value is that the index of the
2466 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2467 // VPlan.
// Recipes without opcode/intrinsic info are rejected as well.
2468 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2469 C->second == Instruction::ExtractValue)))
2470 return false;
2471
2472 // During CSE, we can only handle recipes that don't read from memory: if
2473 // they read from memory, there could be an intervening write to memory
2474 // before the next instance is CSE'd, leading to an incorrect result.
2475 return !Def->mayReadFromMemory();
2476 }
2477
2478 /// Hash the underlying data of \p Def.
/// For a recipe with IR flags that has a predicate, the predicate is mixed in
/// and the function returns at that point; otherwise, for scalar IV steps, the
/// induction opcode is mixed in.
2479 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2480 hash_code Result = hash_combine(
2481 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2482 getGEPSourceElementType(Def), Def->getScalarType(),
2484 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2485 if (RFlags->hasPredicate())
2486 return hash_combine(Result, RFlags->getPredicate());
2487 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2488 return hash_combine(Result, SIVSteps->getInductionOpcode());
2489 return Result;
2490 }
2491
2492 /// Check equality of underlying data of \p L and \p R.
2493 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2494 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2496 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2498 !equal(L->operands(), R->operands()))
2499 return false;
2501 "must have valid opcode info for both recipes");
// The casts of R below rely on L and R having the same recipe ID, which was
// checked above.
2502 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2503 if (LFlags->hasPredicate() &&
2504 LFlags->getPredicate() !=
2505 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2506 return false;
2507 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2508 if (LSIV->getInductionOpcode() !=
2509 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2510 return false;
2511 // Recipes in replicate regions implicitly depend on predicate. If either
2512 // recipe is in a replicate region, only consider them equal if both have
2513 // the same parent.
2514 const VPRegionBlock *RegionL = L->getRegion();
2515 const VPRegionBlock *RegionR = R->getRegion();
2516 if (((RegionL && RegionL->isReplicator()) ||
2517 (RegionR && RegionR->isReplicator())) &&
2518 L->getParent() != R->getParent())
2519 return false;
2520 return L->getScalarType() == R->getScalarType();
2521 }
2522 };
2523 } // end anonymous namespace
2524
2525 /// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2526 /// Plan.
/// NOTE(review): the signature, the map declaration and the block-iteration
/// lines (original lines 2527, 2529, 2531, 2533) are absent from this listing;
/// the loop below looks recipes up in a map named CSEMap.
2528 VPDominatorTree VPDT(Plan);
2530
2532 Plan.getEntry());
2534 for (VPRecipeBase &R : *VPBB) {
2535 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2536 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2537 continue;
2538 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2539 // V must dominate Def for a valid replacement.
// When it does not, Def is skipped and is not recorded in the map either.
2540 if (!VPDT.dominates(V->getParent(), VPBB))
2541 continue;
2542 // Only keep flags present on both V and Def.
2543 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2544 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
// Def itself is not erased here; only its uses are redirected to V.
2545 Def->replaceAllUsesWith(V);
2546 continue;
2547 }
// First occurrence of this expression: record it for later lookups.
2548 CSEMap[Def] = Def;
2549 }
2550 }
2551 }
2552
2553 /// Return true if we do not know how to (mechanically) hoist or sink a
2554 /// non-memory or memory recipe \p R out of a loop region. When sinking, passing
2555 /// \p Sinking = true ensures that assumes aren't sunk.
/// \p FirstBB and \p LastBB delimit the blocks checked for aliasing memory
/// operations.
/// NOTE(review): the first signature line and part of the first condition
/// (original lines 2556 and 2560) are absent from this listing; callers in
/// licm() spell the name cannotHoistOrSinkRecipe.
2557 VPBasicBlock *LastBB,
2558 bool Sinking = false) {
// Recipes that are not replicate recipes, or that do not access memory (plus a
// further condition on the missing line), are decided by
// vputils::cannotHoistOrSinkRecipe alone.
2559 if (!isa<VPReplicateRecipe>(R) || !R.mayReadOrWriteMemory() ||
2561 return vputils::cannotHoistOrSinkRecipe(R, Sinking);
2562
2563 // Check that the memory operation doesn't alias between FirstBB and LastBB.
2564 auto MemLoc = vputils::getMemoryLocation(R);
2565
2566 // TODO: Could make use of SinkStoreInfo::isNoAliasViaDistance by collecting
2567 // stores upfront, and constructing a full SinkStoreInfo.
2568 auto SinkInfo =
2569 Sinking ? std::make_optional(SinkStoreInfo(cast<VPReplicateRecipe>(R)))
2570 : std::nullopt;
2571
// A recipe without a known memory location is conservatively reported as not
// movable.
2572 return !MemLoc ||
2573 !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB, SinkInfo);
2574 }
2575
2576 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
/// Two phases: first hoist recipes whose operands are all defined outside loop
/// regions into the vector preheader, then sink recipes whose users all sit in
/// a single block outside the loop into that block.
/// NOTE(review): the block-iteration lines of both phases (original lines
/// 2588, 2608, 2610) are absent from this listing.
2577 static void licm(VPlan &Plan) {
2578 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2579
2580 // Hoist any loop invariant recipes from the vector loop region to the
2581 // preheader. Perform a shallow traversal of the vector loop region, to
2582 // exclude recipes in replicate regions. Since the top-level blocks in the
2583 // vector loop region are guaranteed to execute if the vector pre-header is,
2584 // we don't need to check speculation safety.
2585 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2586 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2587 "Expected vector prehader's successor to be the vector loop region");
2589 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2590 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2591 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2592 LoopRegion->getExitingBasicBlock()))
2593 continue;
// Only hoist when every operand is defined outside loop regions.
2594 if (any_of(R.operands(), [](VPValue *Op) {
2595 return !Op->isDefinedOutsideLoopRegions();
2596 }))
2597 continue;
// Hoisted recipes are appended at the end of the preheader.
2598 R.moveBefore(*Preheader, Preheader->end());
2599 }
2600 }
2601
// The dominator tree is only needed by the assert in the sinking loop below.
2602 #ifndef NDEBUG
2603 VPDominatorTree VPDT(Plan);
2604 #endif
2605 // Sink recipes with no users inside the vector loop region if all users are
2606 // in the same exit block of the region.
2607 // TODO: Extend to sink recipes from inner loops.
2609 LoopRegion->getEntry());
2611 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2612 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2613 LoopRegion->getExitingBasicBlock(),
2614 /*Sinking=*/true))
2615 continue;
2616
2617 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2618 assert(!RepR->isPredicated() &&
2619 "Expected prior transformation of predicated replicates to "
2620 "replicate regions");
2621 // narrowToSingleScalarRecipes should have already maximally narrowed
2622 // replicates to single-scalar replicates.
2623 // TODO: When unrolling, replicateByVF doesn't handle sunk
2624 // non-single-scalar replicates correctly.
2625 if (!RepR->isSingleScalar())
2626 continue;
2627
2628 // The pointer operand of stores must be loop-invariant.
2629 if (RepR->getOpcode() == Instruction::Store &&
2630 !RepR->getOperand(1)->isDefinedOutsideLoopRegions())
2631 continue;
2632 }
2633
2634 [[maybe_unused]] auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
2635 assert((!R.mayWriteToMemory() ||
2636 (RepR && RepR->getOpcode() == Instruction::Store &&
2637 RepR->getOperand(1)->isDefinedOutsideLoopRegions())) &&
2638 "The only recipes that may write to memory are expected to be "
2639 "stores with invariant pointer-operand");
2640
2641 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2642 // support recipes with multiple defined values (e.g., interleaved loads).
2643 auto *Def = cast<VPSingleDefRecipe>(&R);
2644
2645 // Cannot sink the recipe if the user is defined in a loop region or a
2646 // non-successor of the vector loop region. Cannot sink if user is a phi
2647 // either.
// The lambda also records the users' common parent block in SinkBB and rejects
// the recipe as soon as two users are in different blocks.
2648 VPBasicBlock *SinkBB = nullptr;
2649 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2650 auto *UserR = cast<VPRecipeBase>(U);
2651 VPBasicBlock *Parent = UserR->getParent();
2652 // TODO: Support sinking when users are in multiple blocks.
2653 if (SinkBB && SinkBB != Parent)
2654 return true;
2655 SinkBB = Parent;
2656 // TODO: If the user is a PHI node, we should check the block of
2657 // incoming value. Support PHI node users if needed.
2658 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2659 Parent->getSinglePredecessor() != LoopRegion;
2660 }))
2661 continue;
2662
// A recipe without any users is sunk into the loop region's single successor.
2663 if (!SinkBB)
2664 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2665
2666 // TODO: This will need to be a check instead of an assert after
2667 // conditional branches in vectorized loops are supported.
2668 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2669 "Defining block must dominate sink block");
2670 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2671 // just moving.
2672 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2673 }
2674 }
2675 }
2676
/// Shrink recipes to the bit widths recorded in \p MinBWs, which maps an
/// underlying instruction to its new result size in bits (0 / absent means
/// "leave unchanged"). Operands are truncated as needed and results are
/// zero-extended back to their original type. Does nothing for plans that only
/// have a scalar VF.
/// NOTE(review): several lines are absent from this listing (original lines
/// 2677, 2685, 2687-2688, 2690-2691, 2704, 2727, 2729, 2767), including the
/// first signature line and the declaration of ProcessedTruncs.
2678 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2679 if (Plan.hasScalarVFOnly())
2680 return;
2681 // Keep track of created truncates, so they can be re-used. Note that we
2682 // cannot use RAUW after creating a new truncate, as this could make
2683 // other uses have different types for their operands, making them invalidly
2684 // typed.
2686 VPBasicBlock *PH = Plan.getVectorPreheader();
2689 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2692 continue;
2693
// Recipes whose underlying instruction has no entry in MinBWs are skipped.
2694 VPValue *ResultVPV = R.getVPSingleValue();
2695 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2696 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2697 if (!NewResSizeInBits)
2698 continue;
2699
2700 // If the value wasn't vectorized, we must maintain the original scalar
2701 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2702 // skip casts which do not need to be handled explicitly here, as
2703 // redundant casts will be removed during recipe simplification.
2705 continue;
2706
2707 Type *OldResTy = ResultVPV->getScalarType();
2708 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2709 assert(OldResTy->isIntegerTy() && "only integer types supported");
2710 (void)OldResSizeInBits;
2711
2712 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2713
2714 // Any wrapping introduced by shrinking this operation shouldn't be
2715 // considered undefined behavior. So, we can't unconditionally copy
2716 // arithmetic wrapping flags to VPW.
2717 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2718 VPW->dropPoisonGeneratingFlags();
2719
2720 assert((OldResSizeInBits != NewResSizeInBits ||
2721 match(&R, m_ICmp(m_VPValue(), m_VPValue()))) &&
2722 "Only ICmps should not need extending the result.");
2723 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2724
2725 // For loads/intrinsics we don't recreate the recipe; just wrap the
2726 // original wide result in a ZExt to OldResTy.
2728 if (OldResSizeInBits != NewResSizeInBits) {
2730 Instruction::ZExt, ResultVPV, OldResTy);
// RAUW above also redirected the extend's own operand to itself, so it is
// pointed back at the original result here.
2731 ResultVPV->replaceAllUsesWith(Ext);
2732 Ext->setOperand(0, ResultVPV);
2733 }
2734 continue;
2735 }
2736
2737 // Shrink operands by introducing truncates as needed.
// For a select, operand 0 is left untouched (StartIdx is 1).
2738 unsigned StartIdx =
2739 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2740 SmallVector<VPValue *> NewOperands(R.operands());
2741 for (VPValue *&Op : drop_begin(NewOperands, StartIdx)) {
2742 unsigned OpSizeInBits = Op->getScalarType()->getScalarSizeInBits();
2743 if (OpSizeInBits == NewResSizeInBits)
2744 continue;
2745 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
// Create at most one truncate per operand value and reuse it afterwards.
// Truncates of VPIRValue operands are placed in the preheader, all others
// right before R.
2746 auto [ProcessedIter, Inserted] = ProcessedTruncs.try_emplace(Op);
2747 if (Inserted) {
2748 VPBuilder Builder;
2749 if (isa<VPIRValue>(Op))
2750 Builder.setInsertPoint(PH);
2751 else
2752 Builder.setInsertPoint(&R);
2753 ProcessedIter->second =
2754 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2755 }
2756 Op = ProcessedIter->second;
2757 }
2758
// Re-create the recipe on the narrowed operands; the original is erased below
// once its users have been redirected.
2759 auto *NWR = cast<VPWidenRecipe>(&R)->cloneWithOperands(NewOperands);
2760 NWR->insertBefore(&R);
2761
2762 // Wrap NWR in a ZExt to preserve the original wide type for downstream
2763 // users (unless this is an ICmp, which produces i1 regardless).
2764 VPValue *Replacement = NWR->getVPSingleValue();
2765 if (OldResSizeInBits != NewResSizeInBits)
2766 Replacement =
2768 .createWidenCast(Instruction::ZExt, Replacement, OldResTy)
2769 ->getVPSingleValue();
2770 ResultVPV->replaceAllUsesWith(Replacement);
2771 R.eraseFromParent();
2772 }
2773 }
2774 }
2775
2776bool VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2777 std::optional<VPDominatorTree> VPDT;
2778 if (OnlyLatches)
2779 VPDT.emplace(Plan);
2780
2781 // Collect all blocks before modifying the CFG so we can identify unreachable
2782 // ones after constant branch removal.
2784
2785 bool SimplifiedPhi = false;
2786 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2787 VPValue *Cond;
2788 // Skip blocks that are not terminated by BranchOnCond.
2789 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2790 continue;
2791
2792 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2793 continue;
2794
2795 assert(VPBB->getNumSuccessors() == 2 &&
2796 "Two successors expected for BranchOnCond");
2797 unsigned RemovedIdx;
2798 if (match(Cond, m_True()))
2799 RemovedIdx = 1;
2800 else if (match(Cond, m_False()))
2801 RemovedIdx = 0;
2802 else
2803 continue;
2804
2805 VPBasicBlock *RemovedSucc =
2806 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2807 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2808 "There must be a single edge between VPBB and its successor");
2809 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2810 // these recipes.
2811 auto Phis = RemovedSucc->phis();
2812 for (VPRecipeBase &R : Phis)
2813 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2814 SimplifiedPhi |= !std::empty(Phis);
2815
2816 // Disconnect blocks and remove the terminator.
2817 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2818 VPBB->back().eraseFromParent();
2819 }
2820
2821 // Compute which blocks are still reachable from the entry after constant
2822 // branch removal.
2825
2826 // Detach all unreachable blocks from their successors, removing their recipes
2827 // and incoming values from phi recipes.
2828 VPSymbolicValue Tmp(nullptr);
2829 for (VPBlockBase *B : AllBlocks) {
2830 if (Reachable.contains(B))
2831 continue;
2832 for (VPBlockBase *Succ : to_vector(B->successors())) {
2833 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2834 for (VPRecipeBase &R : SuccBB->phis())
2835 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2837 }
2838 for (VPBasicBlock *DeadBB :
2840 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2841 for (VPValue *Def : R.definedValues())
2842 Def->replaceAllUsesWith(&Tmp);
2843 R.eraseFromParent();
2844 }
2845 }
2846 }
2847 return SimplifiedPhi;
2848}
2849
2870
2871// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2872// the loop terminator with a branch-on-cond recipe with the negated
2873// active-lane-mask as operand. Note that this turns the loop into an
2874// uncountable one. Only the existing terminator is replaced, all other existing
2875// recipes/users remain unchanged, except for poison-generating flags being
2876// dropped from the canonical IV increment. Return the created
2877// VPActiveLaneMaskPHIRecipe.
2878//
2879// The function adds the following recipes:
2880//
2881// vector.ph:
2882// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2883// %EntryALM = active-lane-mask %EntryInc, TC
2884//
2885// vector.body:
2886// ...
2887// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2888// ...
2889// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2890// %ALM = active-lane-mask %InLoopInc, TC
2891// %Negated = Not %ALM
2892// branch-on-cond %Negated
2893//
2896 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2897 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2898 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2899 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2900 // TODO: Check if dropping the flags is needed.
2901 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2902 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2903 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2904 // we have to take unrolling into account. Each part needs to start at
2905 // Part * VF
2906 auto *VecPreheader = Plan.getVectorPreheader();
2907 VPBuilder Builder(VecPreheader);
2908
2909 // Create the ActiveLaneMask instruction using the correct start values.
2910 VPValue *TC = Plan.getTripCount();
2911 VPValue *VF = &Plan.getVF();
2912
2913 auto *EntryIncrement = Builder.createOverflowingOp(
2914 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2915 DL, "index.part.next");
2916
2917 // Create the active lane mask instruction in the VPlan preheader.
2918 VPValue *ALMMultiplier =
2919 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2920 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2921 {EntryIncrement, TC, ALMMultiplier}, DL,
2922 "active.lane.mask.entry");
2923
2924 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2925 // preheader ActiveLaneMask instruction.
2926 auto *LaneMaskPhi =
2928 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2929 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2930
2931 // Create the active lane mask for the next iteration of the loop before the
2932 // original terminator.
2933 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2934 Builder.setInsertPoint(OriginalTerminator);
2935 auto *InLoopIncrement = Builder.createOverflowingOp(
2937 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2938 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2939 {InLoopIncrement, TC, ALMMultiplier}, DL,
2940 "active.lane.mask.next");
2941 LaneMaskPhi->addBackedgeValue(ALM);
2942
2943 // Replace the original terminator with BranchOnCond. We have to invert the
2944 // mask here because a true condition means jumping to the exit block.
2945 auto *NotMask = Builder.createNot(ALM, DL);
2946 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2947 OriginalTerminator->eraseFromParent();
2948 return LaneMaskPhi;
2949}
2950
2952 bool UseActiveLaneMaskForControlFlow) {
2953 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2954 auto *WideCanonicalIV =
2956 assert(WideCanonicalIV &&
2957 "Must have widened canonical IV when tail folding!");
2958 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2959 VPSingleDefRecipe *LaneMask;
2960 if (UseActiveLaneMaskForControlFlow) {
2961 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2962 } else {
2963 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2964 VPValue *ALMMultiplier =
2965 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2966 LaneMask =
2967 B.createNaryOp(VPInstruction::ActiveLaneMask,
2968 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2969 nullptr, "active.lane.mask");
2970 }
2971
2972 // Walk users of WideCanonicalIV and replace the header mask of the form
2973 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2974 // removing the old one to ensure there is always only a single header mask.
2975 HeaderMask->replaceAllUsesWith(LaneMask);
2976 HeaderMask->eraseFromParent();
2977}
2978
/// Pattern matcher that recognizes the specific mask \p In, either on its own
/// or combined with another value via the (logical-and In, Out) pattern.
/// \p Out is bound by reference to a caller-owned variable: it is set to
/// nullptr when the matched value is exactly \p In, and otherwise it is the
/// capture slot handed to m_VPValue for the remaining logical-and operand.
template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
  Op0_t In;
  // Reference to the caller's output slot. It is initialized by the
  // constructor and written from the const match() below, which is why it has
  // to be a reference member rather than a value.
  Op1_t &Out;

  RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}

  template <typename OpTy> bool match(OpTy *V) const {
    // V is exactly the mask: nothing remains once the mask is removed.
    if (m_Specific(In).match(V)) {
      Out = nullptr;
      return true;
    }
    // Otherwise try (logical-and In, X) and let m_VPValue capture X in Out.
    return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
  }
};
2993
2994/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2995/// Returns the remaining part \p Out if so, or nullptr otherwise.
2996template <typename Op0_t, typename Op1_t>
2997static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2998 Op1_t &Out) {
2999 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3000}
3001
3002static std::optional<Intrinsic::ID> getVPDivRemIntrinsic(Intrinsic::ID IntrID) {
3003 switch (IntrID) {
3004 case Intrinsic::masked_udiv:
3005 return Intrinsic::vp_udiv;
3006 case Intrinsic::masked_sdiv:
3007 return Intrinsic::vp_sdiv;
3008 case Intrinsic::masked_urem:
3009 return Intrinsic::vp_urem;
3010 case Intrinsic::masked_srem:
3011 return Intrinsic::vp_srem;
3012 default:
3013 return std::nullopt;
3014 }
3015}
3016
3017/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3018/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3019/// recipe could be created.
3020/// \p HeaderMask Header Mask.
3021/// \p CurRecipe Recipe to be transformed.
3022/// \p EVL The explicit vector length parameter of vector-predication
3023/// intrinsics.
3025 VPRecipeBase &CurRecipe, VPValue &EVL) {
3026 VPlan *Plan = CurRecipe.getParent()->getPlan();
3027 DebugLoc DL = CurRecipe.getDebugLoc();
3028 VPValue *Addr, *Mask, *EndPtr;
3029
3030 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3031 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3032 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3033 EVLEndPtr->insertBefore(&CurRecipe);
3034 // Cast EVL (i32) to match the VF operand's type.
3035 VPValue *EVLAsVF = VPBuilder(EVLEndPtr).createScalarZExtOrTrunc(
3036 &EVL, EVLEndPtr->getOperand(1)->getScalarType(), EVL.getScalarType(),
3038 EVLEndPtr->setOperand(1, EVLAsVF);
3039 return EVLEndPtr;
3040 };
3041
3042 auto GetVPReverse = [&CurRecipe, &EVL, Plan,
3044 if (!V)
3045 return nullptr;
3046 auto *Reverse = new VPWidenIntrinsicRecipe(
3047 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3048 V->getScalarType(), {}, {}, DL);
3049 Reverse->insertBefore(&CurRecipe);
3050 return Reverse;
3051 };
3052
3053 if (match(&CurRecipe,
3054 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3055 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3056 EVL, Mask);
3057
3058 if (match(&CurRecipe,
3059 m_MaskedLoad(m_VPValue(EndPtr),
3060 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3061 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3062 Mask = GetVPReverse(Mask);
3063 Addr = AdjustEndPtr(EndPtr);
3064 auto *LoadR = new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
3065 Addr, EVL, Mask);
3066 LoadR->insertBefore(&CurRecipe);
3067 VPValue *Poison = Plan->getPoison(LoadR->getScalarType());
3068 return new VPWidenIntrinsicRecipe(Intrinsic::vector_splice_left,
3069 {Poison, LoadR, &EVL},
3070 LoadR->getScalarType(), {}, {}, DL);
3071 }
3072
3073 VPValue *Stride;
3075 m_VPValue(Addr), m_VPValue(Stride),
3076 m_RemoveMask(HeaderMask, Mask),
3077 m_TruncOrSelf(m_Specific(&Plan->getVF()))))) {
3078 if (!Mask)
3079 Mask = Plan->getTrue();
3080 auto *NewLoad = cast<VPWidenMemIntrinsicRecipe>(&CurRecipe)->clone();
3081 NewLoad->setOperand(2, Mask);
3082 NewLoad->setOperand(3, &EVL);
3083 return NewLoad;
3084 }
3085
3086 VPValue *StoredVal;
3087 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3088 m_RemoveMask(HeaderMask, Mask))))
3089 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3090 StoredVal, EVL, Mask);
3091
3092 if (match(&CurRecipe,
3093 m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
3094 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3095 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3096 Mask = GetVPReverse(Mask);
3097 Addr = AdjustEndPtr(EndPtr);
3098 VPValue *Poison = Plan->getPoison(StoredVal->getScalarType());
3099 auto *SpliceR = new VPWidenIntrinsicRecipe(
3100 Intrinsic::vector_splice_right, {StoredVal, Poison, &EVL},
3101 StoredVal->getScalarType(), {}, {}, DL);
3102 SpliceR->insertBefore(&CurRecipe);
3103 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3104 SpliceR, EVL, Mask);
3105 }
3106
3107 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3108 if (Rdx->isConditional() &&
3109 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3110 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3111
3112 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3113 if (Interleave->getMask() &&
3114 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3115 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3116
3117 VPValue *LHS, *RHS;
3118 if (match(&CurRecipe, m_SelectLike(m_RemoveMask(HeaderMask, Mask),
3120 return new VPWidenIntrinsicRecipe(
3121 Intrinsic::vp_merge, {Mask ? Mask : Plan->getTrue(), LHS, RHS, &EVL},
3122 LHS->getScalarType(), {}, {}, DL);
3123
3124 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3125 Type *Ty = CurRecipe.getVPSingleValue()->getScalarType();
3126 VPValue *ZExt =
3127 VPBuilder(&CurRecipe)
3128 .createScalarZExtOrTrunc(&EVL, Ty, EVL.getScalarType(), DL);
3129 return new VPInstruction(
3130 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3131 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3132 }
3133
3134 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
3135 if (match(&CurRecipe,
3137 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
3138 return new VPWidenIntrinsicRecipe(Intrinsic::vp_merge,
3139 {RHS, Plan->getTrue(), LHS, &EVL},
3140 LHS->getScalarType(), {}, {}, DL);
3141
3142 if (auto *IntrR = dyn_cast<VPWidenIntrinsicRecipe>(&CurRecipe))
3143 if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID()))
3144 if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask)))
3145 return new VPWidenIntrinsicRecipe(*VPID,
3146 {IntrR->getOperand(0),
3147 IntrR->getOperand(1),
3148 Mask ? Mask : Plan->getTrue(), &EVL},
3149 IntrR->getScalarType(), {}, {}, DL);
3150
3151 return nullptr;
3152}
3153
3154/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3155/// The transforms here need to preserve the original semantics.
3157 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3158 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3161 m_VPValue(EVL))) &&
3162 match(EVL, m_EVL(m_VPValue()))) {
3163 HeaderMask = R.getVPSingleValue();
3164 break;
3165 }
3166 }
3167 if (!HeaderMask)
3168 return;
3169
3170 SmallVector<VPRecipeBase *> OldRecipes;
3171 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3173 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, *EVL)) {
3174 NewR->insertBefore(R);
3175 for (auto [Old, New] :
3176 zip_equal(R->definedValues(), NewR->definedValues()))
3177 Old->replaceAllUsesWith(New);
3178 OldRecipes.push_back(R);
3179 }
3180 }
3181
3182 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3183 // False, EVL)
3184 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3185 VPValue *Mask;
3186 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3187 auto *LogicalAnd = cast<VPInstruction>(U);
3188 auto *Merge = new VPWidenIntrinsicRecipe(
3189 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3190 Mask->getScalarType(), {}, {}, LogicalAnd->getDebugLoc());
3191 Merge->insertBefore(LogicalAnd);
3192 LogicalAnd->replaceAllUsesWith(Merge);
3193 OldRecipes.push_back(LogicalAnd);
3194 }
3195 }
3196
3197 // Fold the following splice patterns:
3198 // splice.right(splice.left(poison, x, evl), poison, evl) -> x
3199 // vector.reverse(splice.left(poison, x, evl)) -> vp.reverse(x, true, evl)
3200 // splice.right(vector.reverse(x), poison, evl) -> vp.reverse(x, true, evl)
3201 for (VPUser *U : collectUsersRecursively(EVL)) {
3202 auto *R = cast<VPRecipeBase>(U);
3203 VPValue *X;
3206 m_Poison(), m_VPValue(X), m_Specific(EVL)),
3207 m_Poison(), m_Specific(EVL)))) {
3208 R->getVPSingleValue()->replaceAllUsesWith(X);
3209 OldRecipes.push_back(R);
3210 continue;
3211 }
3212
3213 if (!match(U,
3216 m_Poison(), m_VPValue(X), m_Specific(EVL))),
3218 m_Reverse(m_VPValue(X)), m_Poison(), m_Specific(EVL)))))
3219 continue;
3220
3221 auto *VPReverse = new VPWidenIntrinsicRecipe(
3222 Intrinsic::experimental_vp_reverse, {X, Plan.getTrue(), EVL},
3223 X->getScalarType(), {}, {}, R->getDebugLoc());
3224 VPReverse->insertBefore(R);
3225 R->getVPSingleValue()->replaceAllUsesWith(VPReverse);
3226 OldRecipes.push_back(R);
3227 }
3228
3229 for (VPRecipeBase *R : reverse(OldRecipes)) {
3230 SmallVector<VPValue *> PossiblyDead(R->operands());
3231 R->eraseFromParent();
3232 for (VPValue *Op : PossiblyDead)
3234 }
3235}
3236
3237/// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3238/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3239/// iteration.
3240static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3241 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3242 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3243
3244 // EVL is i32 but VF/VFxUF are IdxTy. Convert as needed.
3245 VPValue *EVLAsIdx =
3249
3250 assert(all_of(Plan.getVF().users(),
3251 [&Plan](VPUser *U) {
3252 auto IsAllowedUser =
3253 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
3254 VPWidenIntOrFpInductionRecipe,
3255 VPWidenMemIntrinsicRecipe>;
3256 if (match(U, m_Trunc(m_Specific(&Plan.getVF()))))
3257 return all_of(cast<VPSingleDefRecipe>(U)->users(),
3258 IsAllowedUser);
3259 return IsAllowedUser(U);
3260 }) &&
3261 "User of VF that we can't transform to EVL.");
3262 Plan.getVF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3264 });
3265
3266 assert(all_of(Plan.getVFxUF().users(),
3268 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3269 m_Specific(&Plan.getVFxUF())),
3271 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3272 "increment of the canonical induction.");
3273 Plan.getVFxUF().replaceUsesWithIf(EVLAsIdx, [](VPUser &U, unsigned Idx) {
3274 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3275 // canonical induction must not be updated.
3277 });
3278
3279 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3280 // contained.
3281 bool ContainsFORs =
3283 if (ContainsFORs) {
3284 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3285 VPValue *MaxEVL = &Plan.getVF();
3286 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3287 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3288 MaxEVL = Builder.createScalarZExtOrTrunc(
3289 MaxEVL, Type::getInt32Ty(Plan.getContext()), MaxEVL->getScalarType(),
3291
3292 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3293 VPValue *PrevEVL = Builder.createScalarPhi(
3294 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3295
3298 for (VPRecipeBase &R : *VPBB) {
3299 VPValue *V1, *V2;
3300 if (!match(&R,
3302 m_VPValue(V1), m_VPValue(V2))))
3303 continue;
3304 VPValue *Imm = Plan.getOrAddLiveIn(
3307 Intrinsic::experimental_vp_splice,
3308 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3309 R.getVPSingleValue()->getScalarType(), {}, {}, R.getDebugLoc());
3310 VPSplice->insertBefore(&R);
3311 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3312 }
3313 }
3314 }
3315
3316 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3317 if (!HeaderMask)
3318 return;
3319
3320 // Ensure that any reduction that uses a select to mask off tail lanes does so
3321 // in the vector loop, not the middle block, since EVL tail folding can have
3322 // tail elements in the penultimate iteration.
3323 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3324 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3325 m_VPValue(), m_VPValue()))))
3326 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3327 Plan.getVectorLoopRegion();
3328 return true;
3329 }));
3330
3331 // Replace header masks with a mask equivalent to predicating by EVL:
3332 //
3333 // icmp ule widen-canonical-iv backedge-taken-count
3334 // ->
3335 // icmp ult step-vector, EVL
3336 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3337 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3338 Type *EVLType = EVL.getScalarType();
3339 VPValue *EVLMask = Builder.createICmp(
3341 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3342 HeaderMask->replaceAllUsesWith(EVLMask);
3343}
3344
3345/// Converts a tail folded vector loop region to step by
3346/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3347/// iteration.
3348///
3349/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3350/// replace all uses of the canonical IV except for the canonical IV
3351/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3352/// only for loop iterations counting after this transformation.
3353///
3354/// - The header mask is replaced with a header mask based on the EVL.
3355///
3356/// - Plans with FORs have a new phi added to keep track of the EVL of the
3357/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3358/// @llvm.vp.splice.
3359///
3360/// The function uses the following definitions:
3361/// %StartV is the canonical induction start value.
3362///
3363/// The function adds the following recipes:
3364///
3365/// vector.ph:
3366/// ...
3367///
3368/// vector.body:
3369/// ...
3370/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3371/// [ %NextIter, %vector.body ]
3372/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3373/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3374/// ...
3375/// %OpEVL = cast i32 %VPEVL to IVSize
3376/// %NextIter = add IVSize %OpEVL, %CurrentIter
3377/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3378/// ...
3379///
3380/// If MaxSafeElements is provided, the function adds the following recipes:
3381/// vector.ph:
3382/// ...
3383///
3384/// vector.body:
3385/// ...
3386/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3387/// [ %NextIter, %vector.body ]
3388/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3389/// %cmp = cmp ult %AVL, MaxSafeElements
3390/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3391/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3392/// ...
3393/// %OpEVL = cast i32 %VPEVL to IVSize
3394/// %NextIter = add IVSize %OpEVL, %CurrentIter
3395/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3396/// ...
3397///
3399 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3400 if (Plan.hasScalarVFOnly())
3401 return;
3402 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3403 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3404
3405 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3406 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3407 VPValue *StartV = Plan.getZero(CanIVTy);
3408 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3409
3410 // Create the CurrentIteration recipe in the vector loop.
3411 auto *CurrentIteration =
3413 CurrentIteration->insertBefore(*Header, Header->begin());
3414 VPBuilder Builder(Header, Header->getFirstNonPhi());
3415 // Create the AVL (application vector length), starting from TC -> 0 in steps
3416 // of EVL.
3417 VPPhi *AVLPhi = Builder.createScalarPhi(
3418 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3419 VPValue *AVL = AVLPhi;
3420
3421 if (MaxSafeElements) {
3422 // Support for MaxSafeDist for correct loop emission.
3423 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3424 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3425 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3426 "safe_avl");
3427 }
3428 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3429 DebugLoc::getUnknown(), "evl");
3430
3431 Builder.setInsertPoint(CanonicalIVIncrement);
3432 VPValue *OpVPEVL = VPEVL;
3433
3434 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3435 OpVPEVL = Builder.createScalarZExtOrTrunc(
3436 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3437
3438 auto *NextIter = Builder.createAdd(
3439 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3440 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3441 CurrentIteration->addBackedgeValue(NextIter);
3442
3443 VPValue *NextAVL =
3444 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3445 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3446 AVLPhi->addIncoming(NextAVL);
3447
3448 fixupVFUsersForEVL(Plan, *VPEVL);
3449 removeDeadRecipes(Plan);
3450
3451 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3452 // except for the canonical IV increment.
3453 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3454 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3455 // TODO: support unroll factor > 1.
3456 Plan.setUF(1);
3457}
3458
3460 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3461 // There should be only one VPCurrentIteration in the entire plan.
3462 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3463
3466 for (VPRecipeBase &R : VPBB->phis())
3467 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3468 assert(!CurrentIteration &&
3469 "Found multiple CurrentIteration. Only one expected");
3470 CurrentIteration = PhiR;
3471 }
3472
3473 // Early return if it is not variable-length stepping.
3474 if (!CurrentIteration)
3475 return;
3476
3477 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3478 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3479
3480 // Convert CurrentIteration to concrete recipe.
3481 auto *ScalarR =
3482 VPBuilder(CurrentIteration)
3484 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3485 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3486 CurrentIteration->replaceAllUsesWith(ScalarR);
3487 CurrentIteration->eraseFromParent();
3488
3489 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3490 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3491 if (auto *CanIVInc = findUserOf(
3492 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3493 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3494 CanIVInc->eraseFromParent();
3495 }
3496}
3497
3499 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3500 if (!LoopRegion)
3501 return;
3502 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3503 if (Header->empty())
3504 return;
3505 // The EVL IV is always at the beginning.
3506 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3507 if (!EVLPhi)
3508 return;
3509
3510 // Bail if not an EVL tail folded loop.
3511 VPValue *AVL;
3512 if (!match(EVLPhi->getBackedgeValue(),
3513 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3514 return;
3515
3516 // The AVL may be capped to a safe distance.
3517 VPValue *SafeAVL, *UnsafeAVL;
3518 if (match(AVL,
3520 m_VPValue(SafeAVL)),
3521 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3522 AVL = UnsafeAVL;
3523
3524 VPValue *AVLNext;
3525 [[maybe_unused]] bool FoundAVLNext =
3527 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3528 assert(FoundAVLNext && "Didn't find AVL backedge?");
3529
3530 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3531 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3532 if (match(LatchBr, m_BranchOnCond(m_True())))
3533 return;
3534
3535 VPValue *CanIVInc;
3536 [[maybe_unused]] bool FoundIncrement = match(
3537 LatchBr,
3539 m_Specific(&Plan.getVectorTripCount()))));
3540 assert(FoundIncrement &&
3541 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3542 m_Specific(&Plan.getVFxUF()))) &&
3543 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3544 "trip count");
3545
3546 Type *AVLTy = AVLNext->getScalarType();
3547 VPBuilder Builder(LatchBr);
3548 LatchBr->setOperand(
3549 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3550}
3551
3553 VPlan &Plan, PredicatedScalarEvolution &PSE,
3554 const DenseMap<Value *, const SCEV *> &StridesMap,
3555 const VPDominatorTree &VPDT) {
3556 // Replace VPValues for known constant strides guaranteed by predicated scalar
3557 // evolution that are guaranteed to be guarded by the runtime checks; that is,
3558 // blocks dominated by the vector preheader.
3559 assert(!Plan.getVectorLoopRegion() &&
3560 "expected to run before loop regions are created");
3561 VPBlockBase *Preheader = Plan.getEntry()->getSuccessors()[1];
3562 auto CanUseVersionedStride = [&VPDT, Preheader](VPUser &U, unsigned) {
3563 auto *R = cast<VPRecipeBase>(&U);
3564 VPBlockBase *Parent = R->getParent();
3565 return VPDT.dominates(Preheader, Parent);
3566 };
3567 ValueToSCEVMapTy RewriteMap;
3568 for (const SCEV *Stride : StridesMap.values()) {
3569 using namespace SCEVPatternMatch;
3570 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3571 const APInt *StrideConst;
3572 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3573 // Only handle constant strides for now.
3574 continue;
3575
3576 auto *CI = Plan.getConstantInt(*StrideConst);
3577 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3578 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3579
3580 // The versioned value may not be used in the loop directly but through a
3581 // sext/zext. Add new live-ins in those cases.
3582 for (Value *U : StrideV->users()) {
3584 continue;
3585 VPValue *StrideVPV = Plan.getLiveIn(U);
3586 if (!StrideVPV)
3587 continue;
3588 unsigned BW = U->getType()->getScalarSizeInBits();
3589 APInt C =
3590 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3591 VPValue *CI = Plan.getConstantInt(C);
3592 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3593 }
3594 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3595 }
3596
3597 for (VPRecipeBase &R : *Plan.getEntry()) {
3598 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3599 if (!ExpSCEV)
3600 continue;
3601 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3602 auto *NewSCEV =
3603 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3604 if (NewSCEV != ScevExpr) {
3605 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3606 ExpSCEV->replaceAllUsesWith(NewExp);
3607 if (Plan.getTripCount() == ExpSCEV)
3608 Plan.resetTripCount(NewExp);
3609 }
3610 }
3611}
3612
// NOTE(review): the numeric prefixes show that several original lines of this
// definition are absent from this listing (3613, 3616, 3618, 3632, 3658-3659
// and 3684-3685), including the signature line and presumably the declarations
// of Worklist and Visited. The comments below cover only the visible code.
//
// For every consecutive widened memory recipe and every interleave recipe
// whose mask is present and is not the header mask, walk the backward slice of
// the address and neutralize poison-generating flags found on the way.
3614 // Collect recipes in the backward slice of `Root` that may generate a poison
3615 // value that is used after vectorization.
3617 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3619 Worklist.push_back(Root);
3620
3621 // Traverse the backward slice of Root through its use-def chain.
3622 while (!Worklist.empty()) {
3623 VPRecipeBase *CurRec = Worklist.pop_back_val();
3624
// Skip recipes that were already processed.
3625 if (!Visited.insert(CurRec).second)
3626 continue;
3627
3628 // Prune search if we find another recipe generating a widen memory
3629 // instruction. Widen memory instructions involved in address computation
3630 // will lead to gather/scatter instructions, which don't need to be
3631 // handled.
3633 VPHeaderPHIRecipe>(CurRec))
3634 continue;
3635
3636 // This recipe contributes to the address computation of a widen
3637 // load/store. If the underlying instruction has poison-generating flags,
3638 // drop them directly.
3639 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3640 VPValue *A, *B;
3641 // Dropping disjoint from an OR may yield incorrect results, as some
3642 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3643 // for dependence analysis). Instead, replace it with an equivalent Add.
3644 // This is possible as all users of the disjoint OR only access lanes
3645 // where the operands are disjoint or poison otherwise.
3646 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3647 RecWithFlags->isDisjoint()) {
3648 VPBuilder Builder(RecWithFlags);
3649 VPInstruction *New =
3650 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3651 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3652 RecWithFlags->replaceAllUsesWith(New);
3653 RecWithFlags->eraseFromParent();
// The OR was erased; continue the walk from its replacement so that the
// operands pushed below are read from a live recipe.
3654 CurRec = New;
3655 } else
3656 RecWithFlags->dropPoisonGeneratingFlags();
3657 } else {
// Recipes that are not VPRecipeWithIRFlags are only checked (in asserts
// builds) for not carrying poison-generating flags on `Instr`, which is
// defined on lines missing from this listing.
3660 (void)Instr;
3661 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3662 "found instruction with poison generating flags not covered by "
3663 "VPRecipeWithIRFlags");
3664 }
3665
3666 // Add new definitions to the worklist.
3667 for (VPValue *Operand : CurRec->operands())
3668 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3669 Worklist.push_back(OpDef);
3670 }
3671 });
3672
3673 // We want to exclude the tail folding case, as we don't need to drop flags
3674 // for operations computing the first lane in this case: the first lane of the
3675 // header mask must always be true.
// Note that a null mask (unmasked access) also yields false here.
3676 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3677 return Mask && !vputils::isHeaderMask(Mask, Plan);
3678 };
3679
3680 // Traverse all the recipes in the VPlan and process the poison-generating
3681 // recipes in the backward slice starting at the address of a
3682 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3683 auto Iter =
3686 for (VPRecipeBase &Recipe : *VPBB) {
3687 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3688 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
// Only consecutive accesses whose address is defined by a recipe and
// whose mask is not the header mask are handled.
3689 if (AddrDef && WidenRec->isConsecutive() &&
3690 IsNotHeaderMask(WidenRec->getMask()))
3691 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3692 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3693 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3694 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3695 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3696 }
3697 }
3698 }
3699}
3700
// NOTE(review): original lines 3701, 3703, 3709, 3711-3712 and 3756 are absent
// from this listing (function name, the type of InterleaveGroups, and
// presumably the declarations of IRMemberToRecipe and NW). The comments below
// cover only the visible code.
//
// Replaces the widened memory recipes belonging to each interleave group in
// \p InterleaveGroups with a single VPInterleaveRecipe placed at the recipe of
// the group's insert position. Does nothing if \p InterleaveGroups is empty.
3702 VPlan &Plan,
3704 &InterleaveGroups,
3705 const bool &EpilogueAllowed) {
3706 if (InterleaveGroups.empty())
3707 return;
3708
// Record, for every widened memory recipe visited, the mapping from its
// underlying IR instruction to the recipe.
3710 for (VPBasicBlock *VPBB :
3713 for (VPRecipeBase &R : make_filter_range(*VPBB, [](VPRecipeBase &R) {
3714 return isa<VPWidenMemoryRecipe>(&R);
3715 })) {
3716 auto *MemR = cast<VPWidenMemoryRecipe>(&R);
3717 IRMemberToRecipe[&MemR->getIngredient()] = MemR;
3718 }
3719
3720 // Interleave memory: for each Interleave Group we marked earlier as relevant
3721 // for this VPlan, replace the Recipes widening its memory instructions with a
3722 // single VPInterleaveRecipe at its insertion point.
3723 VPDominatorTree VPDT(Plan);
3724 for (const auto *IG : InterleaveGroups) {
3725 // Skip interleave groups where members don't have recipes. This can happen
3726 // when removeDeadRecipes removes recipes that are part of interleave groups
3727 // but have no users.
3728 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3729 return !IRMemberToRecipe.contains(Member);
3730 }))
3731 continue;
3732
// Start from member 0: seed the group metadata from its recipe and, if it
// is a store, its stored value. The remaining members contribute their
// stored values in index order and have their metadata intersected in.
3733 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3734 VPIRMetadata InterleaveMD(*Start);
3735 SmallVector<VPValue *, 4> StoredValues;
3736 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start->getAsRecipe()))
3737 StoredValues.push_back(StoreR->getStoredValue());
3738 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3739 Instruction *MemberI = IG->getMember(I);
// Gaps in the group have no member instruction.
3740 if (!MemberI)
3741 continue;
3742 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3743 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR->getAsRecipe()))
3744 StoredValues.push_back(StoreR->getStoredValue());
3745 InterleaveMD.intersect(*MemoryR);
3746 }
3747
// A gap mask is requested when the group requires a scalar epilogue that is
// not allowed, or when this is a store group (it has stored values) that is
// not full.
3748 bool NeedsMaskForGaps =
3749 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3750 (!StoredValues.empty() && !IG->isFull());
3751
3752 Instruction *IRInsertPos = IG->getInsertPos();
3753 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3754 VPRecipeBase *InsertPosR = InsertPos->getAsRecipe();
3755
// Take the no-wrap flags from the GEP feeding the insert position's pointer
// operand (looking through pointer casts), with the no-unsigned-wrap flag
// removed. NOTE(review): presumably nuw is dropped because the offset
// applied below is negative - confirm.
3757 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3758 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3759 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3760
3761 // Get or create the start address for the interleave group.
3762 VPValue *Addr = Start->getAddr();
3763 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3764 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPosR)) {
3765 // We cannot re-use the address of member zero because it does not
3766 // dominate the insert position. Instead, use the address of the insert
3767 // position and create a PtrAdd adjusting it to the address of member
3768 // zero.
3769 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3770 // InsertPos or sink loads above zero members to join it.
3771 assert(IG->getIndex(IRInsertPos) != 0 &&
3772 "index of insert position shouldn't be zero");
3773 auto &DL = IRInsertPos->getDataLayout();
// Byte distance from member zero to the insert position: element alloc
// size times the member index, held in a 32-bit APInt and negated below.
3774 APInt Offset(32,
3775 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3776 IG->getIndex(IRInsertPos),
3777 /*IsSigned=*/true);
3778 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3779 VPBuilder B(InsertPosR);
3780 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3781 }
3782 // If the group is reverse, adjust the index to refer to the last vector
3783 // lane instead of the first. We adjust the index from the first vector
3784 // lane, rather than directly getting the pointer for lane VF - 1, because
3785 // the pointer operand of the interleaved access is supposed to be uniform.
3786 if (IG->isReverse()) {
3787 auto *ReversePtr = new VPVectorEndPointerRecipe(
3788 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3789 -(int64_t)IG->getFactor(), NW, InsertPosR->getDebugLoc());
3790 ReversePtr->insertBefore(InsertPosR);
3791 Addr = ReversePtr;
3792 }
3793 auto *VPIG = new VPInterleaveRecipe(
3794 IG, Addr, StoredValues, InsertPos->getMask(), NeedsMaskForGaps,
3795 InterleaveMD, InsertPosR->getDebugLoc());
3796 VPIG->insertBefore(InsertPosR);
3797
// Redirect users of each value-producing member to the corresponding result
// of the interleave recipe, then erase the member recipe. J counts only
// members whose IR type is non-void, so gaps and stores do not consume a
// result index.
3798 unsigned J = 0;
3799 for (unsigned i = 0; i < IG->getFactor(); ++i)
3800 if (Instruction *Member = IG->getMember(i)) {
3801 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member)->getAsRecipe();
3802 if (!Member->getType()->isVoidTy()) {
3803 VPValue *OriginalV = MemberR->getVPSingleValue();
3804 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3805 J++;
3806 }
3807 MemberR->eraseFromParent();
3808 }
3809 }
3810}
3811
3812/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3813/// value, phi and backedge value. In the following example:
3814///
3815/// vector.ph:
3816/// Successor(s): vector loop
3817///
3818/// <x1> vector loop: {
3819/// vector.body:
3820/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3821/// ...
3822/// EMIT branch-on-count ...
3823/// No successors
3824/// }
3825///
3826/// WIDEN-INDUCTION will get expanded to:
3827///
3828/// vector.ph:
3829/// ...
3830/// vp<%induction.start> = ...
3831/// vp<%induction.increment> = ...
3832///
3833/// Successor(s): vector loop
3834///
3835/// <x1> vector loop: {
3836/// vector.body:
3837/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3838/// ...
3839/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3840/// EMIT branch-on-count ...
3841/// No successors
3842/// }
///
/// The recipe itself is not erased here; only its uses are redirected to the
/// new widened phi.
///
/// NOTE(review): original lines 3844, 3856-3857, 3879 and 3922 are absent from
/// this listing (the function name and parameter list, and presumably the
/// declarations of AddOp, MulOp and ExitingBB and part of the IVIntTy
/// initializer).
3843static void
3845 VPlan *Plan = WidenIVR->getParent()->getPlan();
3846 VPValue *Start = WidenIVR->getStartValue();
3847 VPValue *Step = WidenIVR->getStepValue();
3848 VPValue *VF = WidenIVR->getVFValue();
3849 DebugLoc DL = WidenIVR->getDebugLoc();
3850
3851 // The value from the original loop to which we are mapping the new induction
3852 // variable.
3853 Type *Ty = WidenIVR->getScalarType();
3854
3855 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3858 VPIRFlags Flags = *WidenIVR;
// Integer inductions use Add/Mul; otherwise the add opcode comes from the
// induction descriptor and the multiply is FMul.
3859 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3860 AddOp = Instruction::Add;
3861 MulOp = Instruction::Mul;
3862 } else {
3863 AddOp = ID.getInductionOpcode();
3864 MulOp = Instruction::FMul;
3865 }
3866
3867 // If the phi is truncated, truncate the start and step values.
3868 VPBuilder Builder(Plan->getVectorPreheader());
3869 Type *StepTy = Step->getScalarType();
3870 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3871 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3872 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3873 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3874 StepTy = Ty;
3875 }
3876
3877 // Construct the initial value of the vector IV in the vector loop preheader.
// The result is splat(Start) AddOp (step-vector MulOp splat(Step)); for FP
// steps the step-vector is first converted with UIToFP.
3878 Type *IVIntTy =
3880 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3881 if (StepTy->isFloatingPointTy())
3882 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3883
3884 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3885 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3886
3887 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3888 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3889 DebugLoc::getUnknown(), "induction");
3890
3891 // Create the widened phi of the vector IV.
// It is inserted at the position of the recipe being expanded; its backedge
// value is added once it has been created below.
3892 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3893 Init, WidenIVR->getDebugLoc(), "vec.ind");
3894
3895 // Create the backedge value for the vector IV.
3896 VPValue *Inc;
3897 VPValue *Prev;
3898 // If unrolled, use the increment and prev value from the operands.
3899 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3900 Inc = SplatVF;
3901 Prev = WidenIVR->getLastUnrolledPartOperand();
3902 } else {
3903 // Move the insertion point after the VF definition when the VF is defined
3904 // inside a loop, such as for EVL tail-folding.
3905 if (VPRecipeBase *R = VF->getDefiningRecipe())
3906 if (R->getParent()->getEnclosingLoopRegion())
3907 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3908
3909 // Multiply the vectorization factor by the step using integer or
3910 // floating-point arithmetic as appropriate.
3911 if (StepTy->isFloatingPointTy())
3912 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3913 DL);
3914 else
3915 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3916
3917 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3918 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3919 Prev = WidePHI;
3920 }
3921
// Emit the increment immediately before the terminator of ExitingBB.
3923 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3924 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3925 WidenIVR->getDebugLoc(), "vec.ind.next");
3926
3927 WidePHI->addIncoming(Next);
3928
3929 WidenIVR->replaceAllUsesWith(WidePHI);
3930}
3931
3932/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3933/// initial value, phi and backedge value. In the following example:
3934///
3935/// <x1> vector loop: {
3936/// vector.body:
3937/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3938/// ...
3939/// EMIT branch-on-count ...
3940/// }
3941///
3942/// WIDEN-POINTER-INDUCTION will get expanded to:
3943///
3944/// <x1> vector loop: {
3945/// vector.body:
3946/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3947/// EMIT %mul = mul %stepvector, %step
3948/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3949/// ...
3950/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3951/// EMIT branch-on-count ...
3952/// }
///
/// The recipe itself is not erased here; only its uses are redirected to the
/// new wide pointer add.
///
/// NOTE(review): original lines 3953, 3960 and 3983 are absent from this
/// listing (the signature, part of the first assert's condition, and
/// presumably the declaration of ExitingBB).
3954 VPlan *Plan = R->getParent()->getPlan();
3955 VPValue *Start = R->getStartValue();
3956 VPValue *Step = R->getStepValue();
3957 VPValue *VF = R->getVFValue();
3958
3959 assert(R->getInductionDescriptor().getKind() ==
3961 "Not a pointer induction according to InductionDescriptor!");
3962 assert(R->getScalarType()->isPointerTy() && "Unexpected type.");
3963 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3964 "Recipe should have been replaced");
3965
3966 VPBuilder Builder(R);
3967 DebugLoc DL = R->getDebugLoc();
3968
3969 // Build a scalar pointer phi.
3970 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3971
3972 // Create actual address geps that use the pointer phi as base and a
3973 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
// These are inserted at the first non-phi position of the recipe's block.
3974 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3975 Type *StepTy = Step->getScalarType();
3976 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3977 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3978 VPValue *PtrAdd =
3979 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3980 R->replaceAllUsesWith(PtrAdd);
3981
3982 // Create the backedge value for the scalar pointer phi.
// It is pointer.phi + Step * VF, with VF zero-extended or truncated to the
// step type, emitted immediately before the terminator of ExitingBB.
3984 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3985 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, VF->getScalarType(), DL);
3986 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3987
3988 VPValue *InductionGEP =
3989 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3990 ScalarPtrPhi->addIncoming(InductionGEP);
3991}
3992
3993/// Expand a VPDerivedIVRecipe into executable recipes.
///
/// The new recipes are created at the position of \p R and replace all of its
/// uses; \p R itself is not erased here.
///
/// NOTE(review): original lines 3994, 4005, 4007, 4013, 4016 and 4028 are
/// absent from this listing (the signature, the tail of the SIToFP cast call,
/// and the `case` labels of the switch), so the induction kind that selects
/// each arm below is not visible.
3995 VPBuilder Builder(R);
3996 VPIRValue *Start = R->getStartValue();
3997 VPValue *Step = R->getStepValue();
3998 VPValue *Index = R->getIndex();
3999 Type *StepTy = Step->getScalarType();
4000 Type *IndexTy = Index->getScalarType();
// Bring the index to the step type: sign-extend or truncate for integer
// steps, signed int-to-FP conversion otherwise.
4001 Index = StepTy->isIntegerTy()
4002 ? Builder.createScalarSExtOrTrunc(
4003 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
4004 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
4006 switch (R->getInductionKind()) {
// Arm producing Start + Index * Step with an integer add.
4008 assert(Index->getScalarType() == Start->getScalarType() &&
4009 "Index type does not match StartValue type");
4010 return R->replaceAllUsesWith(Builder.createAdd(
4011 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
4012 }
// Arm producing ptradd(Start, Index * Step).
4014 return R->replaceAllUsesWith(Builder.createPtrAdd(
4015 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
// Arm for FP steps: Start (fadd|fsub) Step * Index, reusing the opcode and
// fast-math flags of the original FP binary operator.
4017 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
4018 const FPMathOperator *FPBinOp = R->getFPBinOp();
4019 assert(FPBinOp &&
4020 (FPBinOp->getOpcode() == Instruction::FAdd ||
4021 FPBinOp->getOpcode() == Instruction::FSub) &&
4022 "Original BinOp should be defined for FP induction");
4023 FastMathFlags FMF = FPBinOp->getFastMathFlags();
4024 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
4025 return R->replaceAllUsesWith(
4026 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
4027 }
// Arm that leaves R untouched.
4029 return;
4030 }
4031 llvm_unreachable("Unhandled induction kind");
4032}
4033
// NOTE(review): original lines 4034 and 4037 are absent from this listing
// (the signature and the head of the for statement below).
4035 // Replace loop regions with explicit CFG.
// Regions are collected first and dissolved in a second pass, so the
// traversal does not run while regions are being dissolved.
4036 SmallVector<VPRegionBlock *> LoopRegions;
4038 vp_depth_first_deep(Plan.getEntry()))) {
// Replicator regions are left in place.
4039 if (!R->isReplicator())
4040 LoopRegions.push_back(R);
4041 }
4042 for (VPRegionBlock *R : LoopRegions)
4043 R->dissolveToCFGLoop();
4044}
4045
// NOTE(review): original lines 4046-4047, 4050-4051, 4100 and 4104 are absent
// from this listing (the signature, presumably the WorkList declaration and
// the head of the block traversal, and the two statements that create the
// single-condition branches in the general case).
4048 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4049 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// The branches are collected first because the expansion below rewires the
// CFG.
4052 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4053 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4054 }
4055
4056 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4057 // single-condition branches:
4058 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4059 // the first condition is true, and otherwise jumps to a new interim block.
4060 // 2. A branch that ends the interim block, jumps to the second successor if
4061 // the second condition is true, and otherwise jumps to the third
4062 // successor.
4063 for (VPInstruction *Br : WorkList) {
4064 assert(Br->getNumOperands() == 2 &&
4065 "BranchOnTwoConds must have exactly 2 conditions");
4066 DebugLoc DL = Br->getDebugLoc();
4067 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
// Copy the successor list before the blocks are disconnected below.
4068 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4069 assert(Successors.size() == 3 &&
4070 "BranchOnTwoConds must have exactly 3 successors");
4071
4072 for (VPBlockBase *Succ : Successors)
4073 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4074
4075 VPValue *Cond0 = Br->getOperand(0);
4076 VPValue *Cond1 = Br->getOperand(1);
4077 VPBlockBase *Succ0 = Successors[0];
4078 VPBlockBase *Succ1 = Successors[1];
4079 VPBlockBase *Succ2 = Successors[2];
4080
4081 // If the successor block for both conditions is the same, then combine the
4082 // two conditions and plant a single conditional branch.
4083 if (Succ0 == Succ1) {
4084 VPBuilder Builder(Br);
4085 VPValue *Combined = Builder.createOr(Cond0, Cond1, DL);
4086 Builder.createNaryOp(VPInstruction::BranchOnCond, {Combined}, DL);
4087 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4088 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ2);
4089 Br->eraseFromParent();
4090 continue;
4091 }
4092
4093 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4094 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4095
// General case: a new ".interim" block sits between the original block and
// the second and third successors.
4096 VPBasicBlock *InterimBB =
4097 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4098
4099 VPBuilder(BrOnTwoCondsBB)
4101 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4102 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4103
4105 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4106 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4107 Br->eraseFromParent();
4108 }
4109}
4110
// NOTE(review): original lines 4111-4112, 4117, 4132, 4196, 4219, 4232, 4255
// and 4263 are absent from this listing (the signature and head of the outer
// loop, the calls that expand the two widened induction recipes, and several
// match/declaration lines). The comments below cover only the visible code.
//
// Visits every recipe of every visited block and lowers abstract recipes into
// more concrete ones. Recipes are iterated with make_early_inc_range because
// most arms erase the recipe they handle.
4113 vp_depth_first_deep(Plan.getEntry()))) {
4114 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
// New recipes are inserted immediately before the recipe being lowered.
4115 VPBuilder Builder(&R);
4116 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4118 WidenIVR->eraseFromParent();
4119 continue;
4120 }
4121
4122 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4123 // If the recipe only generates scalars, scalarize it instead of
4124 // expanding it.
4125 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4126 VPValue *PtrAdd =
4127 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4128 WidenIVR->replaceAllUsesWith(PtrAdd);
4129 WidenIVR->eraseFromParent();
4130 continue;
4131 }
4133 WidenIVR->eraseFromParent();
4134 continue;
4135 }
4136
4137 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
4138 expandVPDerivedIV(DerivedIVR);
4139 DerivedIVR->eraseFromParent();
4140 continue;
4141 }
4142
// Lower the widened canonical IV to
// broadcast(CanIV) + (broadcast(Step) + step-vector).
4143 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(&R)) {
4144 VPValue *CanIV = WideCanIV->getCanonicalIV();
4145 Type *CanIVTy = CanIV->getScalarType();
4146 VPValue *Step = WideCanIV->getStepValue();
// A missing step is only expected for UF == 1 and is treated as zero.
4147 if (!Step) {
4148 assert(Plan.getConcreteUF() == 1 &&
4149 "Expected unroller to have materialized step for UF != 1");
4150 Step = Plan.getZero(CanIVTy);
4151 }
4152 CanIV = Builder.createNaryOp(VPInstruction::Broadcast, CanIV);
4153 Step = Builder.createNaryOp(VPInstruction::Broadcast, Step);
4154 Step = Builder.createAdd(
4155 Step, Builder.createNaryOp(VPInstruction::StepVector, {}, CanIVTy));
4156 VPValue *CanVecIV =
4157 Builder.createAdd(CanIV, Step, WideCanIV->getDebugLoc(), "vec.iv",
4158 WideCanIV->getNoWrapFlags());
4159 WideCanIV->replaceAllUsesWith(CanVecIV);
4160 WideCanIV->eraseFromParent();
4161 continue;
4162 }
4163
4164 // Expand VPBlendRecipe into VPInstruction::Select.
// Incoming value 0 is the initial value; each later incoming value I is
// selected over the running result under mask I.
4165 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4166 VPValue *Select = Blend->getIncomingValue(0);
4167 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4168 Select = Builder.createSelect(Blend->getMask(I),
4169 Blend->getIncomingValue(I), Select,
4170 R.getDebugLoc(), "predphi", *Blend);
4171 Blend->replaceAllUsesWith(Select);
4172 Blend->eraseFromParent();
4173 continue;
4174 }
4175
// Vector end pointer recipes are kept; only a missing offset is
// materialized, which is expected solely for UF == 1.
4176 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4177 if (!VEPR->getOffset()) {
4178 assert(Plan.getConcreteUF() == 1 &&
4179 "Expected unroller to have materialized offset for UF != 1");
4180 VEPR->materializeOffset();
4181 }
4182 continue;
4183 }
4184
// Expression recipes are decomposed and then erased.
4185 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4186 Expr->decompose();
4187 Expr->eraseFromParent();
4188 continue;
4189 }
4190
4191 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4192 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4193 if (LastActiveL &&
4194 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4195 // Create Not(Mask) for all operands.
4197 for (VPValue *Op : LastActiveL->operands()) {
4198 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4199 NotMasks.push_back(NotMask);
4200 }
4201
4202 // Create FirstActiveLane on the inverted masks.
4203 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
4204 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
4205
4206 // Subtract 1 to get the last active lane.
4207 VPValue *One =
4208 Plan.getConstantInt(FirstInactiveLane->getScalarType(), 1);
4209 VPValue *LastLane =
4210 Builder.createSub(FirstInactiveLane, One,
4211 LastActiveL->getDebugLoc(), "last.active.lane");
4212
4213 LastActiveL->replaceAllUsesWith(LastLane);
4214 LastActiveL->eraseFromParent();
4215 continue;
4216 }
4217
4218 // Lower MaskedCond with block mask to LogicalAnd.
4220 auto *VPI = cast<VPInstruction>(&R);
4221 assert(VPI->isMasked() &&
4222 "Unmasked MaskedCond should be simplified earlier");
4223 VPI->replaceAllUsesWith(Builder.createNaryOp(
4224 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4225 VPI->eraseFromParent();
4226 continue;
4227 }
4228
4229 // Lower CanonicalIVIncrementForPart to plain Add.
// The operands, no-wrap flags and debug location are carried over.
4230 if (match(
4231 &R,
4233 auto *VPI = cast<VPInstruction>(&R);
4234 VPValue *Add = Builder.createOverflowingOp(
4235 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4236 VPI->getDebugLoc());
4237 VPI->replaceAllUsesWith(Add);
4238 VPI->eraseFromParent();
4239 continue;
4240 }
4241
4242 // Lower BranchOnCount to ICmp + BranchOnCond.
4243 VPValue *IV, *TC;
4244 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4245 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4246 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4247 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4248 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4249 BranchOnCountInst->eraseFromParent();
4250 continue;
4251 }
4252
// Any recipe that does not match the pattern below (whose head is on a
// missing line) is left unchanged.
4253 VPValue *VectorStep;
4254 VPValue *ScalarStep;
4256 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4257 continue;
4258
4259 // Expand WideIVStep.
// Both operands are first cast to the instruction's scalar type if they
// differ from it, then multiplied.
4260 auto *VPI = cast<VPInstruction>(&R);
4261 Type *IVTy = VPI->getScalarType();
4262 if (VectorStep->getScalarType() != IVTy) {
4264 ? Instruction::UIToFP
4265 : Instruction::Trunc;
4266 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4267 }
4268
4269 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4270 if (ScalarStep->getScalarType() != IVTy) {
4271 ScalarStep =
4272 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4273 }
4274
// FP types multiply with FMul and the instruction's fast-math flags (or
// none); other types use Mul with the default flags for that opcode.
4275 VPIRFlags Flags;
4276 unsigned MulOpc;
4277 if (IVTy->isFloatingPointTy()) {
4278 MulOpc = Instruction::FMul;
4279 Flags = VPI->getFastMathFlagsOrNone();
4280 } else {
4281 MulOpc = Instruction::Mul;
4282 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4283 }
4284
4285 VPInstruction *Mul = Builder.createNaryOp(
4286 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4287 VectorStep = Mul;
4288 VPI->replaceAllUsesWith(VectorStep);
4289 VPI->eraseFromParent();
4290 }
4291 }
4292}
4293
4294/// Returns the VPValue representing the uncountable exit comparison used by
4295/// AnyOf if the recipes it depends on can be traced back to live-ins and
4296/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
4297/// generating the values for the comparison. The recipes are stored in
4298/// \p Recipes.
///
/// Returns std::nullopt when the latch terminator does not have the expected
/// form, when a value in the chain has more than one user, when an
/// unsupported recipe is encountered, or when nothing relevant was collected.
///
/// NOTE(review): original lines 4300, 4358, 4385, 4390 and 4401 are absent
/// from this listing (the function name and first parameter, presumably the
/// Worklist declaration, the heads of two match conditions, and the body of
/// the final predicate).
4299static std::optional<VPValue *>
4301 VPBasicBlock *LatchVPBB) {
4302 // Given a plain CFG VPlan loop with countable latch exiting block
4303 // \p LatchVPBB, we're looking to match the recipes contributing to the
4304 // uncountable exit condition comparison (here, vp<%4>) back to either
4305 // live-ins or the address nodes for the load used as part of the uncountable
4306 // exit comparison so that we can either move them within the loop, or copy
4307 // them to the preheader depending on the chosen method for dealing with
4308 // stores in uncountable exit loops.
4309 //
4310 // Currently, the address of the load is restricted to a GEP with 2 operands
4311 // and a live-in base address. This constraint may be relaxed later.
4312 //
4313 // VPlan ' for UF>=1' {
4314 // Live-in vp<%0> = VF * UF
4315 // Live-in vp<%1> = vector-trip-count
4316 // Live-in ir<20> = original trip-count
4317 //
4318 // ir-bb<entry>:
4319 // Successor(s): scalar.ph, vector.ph
4320 //
4321 // vector.ph:
4322 // Successor(s): for.body
4323 //
4324 // for.body:
4325 // EMIT vp<%2> = phi ir<0>, vp<%index.next>
4326 // EMIT-SCALAR ir<%iv> = phi [ ir<0>, vector.ph ], [ ir<%iv.next>, for.inc ]
4327 // EMIT ir<%uncountable.addr> = getelementptr inbounds nuw ir<%pred>,ir<%iv>
4328 // EMIT ir<%uncountable.val> = load ir<%uncountable.addr>
4329 // EMIT ir<%uncountable.cond> = icmp sgt ir<%uncountable.val>, ir<500>
4330 // EMIT vp<%3> = masked-cond ir<%uncountable.cond>
4331 // Successor(s): for.inc
4332 //
4333 // for.inc:
4334 // EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
4335 // EMIT ir<%countable.cond> = icmp eq ir<%iv.next>, ir<20>
4336 // EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
4337 // EMIT vp<%4> = any-of vp<%3>
4338 // EMIT vp<%5> = icmp eq vp<%index.next>, vp<%1>
4339 // EMIT branch-on-two-conds vp<%4>, vp<%5>
4340 // Successor(s): middle.block, middle.block, for.body
4341 //
4342 // middle.block:
4343 // Successor(s): ir-bb<exit>, scalar.ph
4344 //
4345 // ir-bb<exit>:
4346 // No successors
4347 //
4348 // scalar.ph:
4349 // }
4350
4351 // Find the uncountable loop exit condition.
// It is the operand of the AnyOf feeding the first condition of the latch's
// BranchOnTwoConds terminator.
4352 VPValue *UncountableCondition = nullptr;
4353 if (!match(LatchVPBB->getTerminator(),
4354 m_BranchOnTwoConds(m_AnyOf(m_VPValue(UncountableCondition)),
4355 m_VPValue())))
4356 return std::nullopt;
4357
4359 Worklist.push_back(UncountableCondition);
4360 while (!Worklist.empty()) {
4361 VPValue *V = Worklist.pop_back_val();
4362
4363 // Any value defined outside the loop does not need to be copied.
4364 if (V->isDefinedOutsideLoopRegions())
4365 continue;
4366
4367 // FIXME: Remove the single user restriction; it's here because we're
4368 // starting with the simplest set of loops we can, and multiple
4369 // users means needing to add PHI nodes in the transform.
4370 if (V->getNumUsers() > 1)
4371 return std::nullopt;
4372
4373 VPValue *Op1, *Op2;
4374 // Walk back through recipes until we find at least one load from memory.
4375 if (match(V, m_ICmp(m_VPValue(Op1), m_VPValue(Op2)))) {
4376 Worklist.push_back(Op1);
4377 Worklist.push_back(Op2);
4378 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4379 } else if (match(V, m_VPInstruction<Instruction::Load>(m_VPValue(Op1)))) {
// NOTE(review): GepR is dereferenced below without a null check, although
// getDefiningRecipe() results are null-checked elsewhere in this file.
// Presumably the load address is never a value without a defining recipe
// on this path - confirm.
4380 VPRecipeBase *GepR = Op1->getDefiningRecipe();
4381 // Only matching base + single offset term for now.
4382 if (GepR->getNumOperands() != 2)
4383 return std::nullopt;
4384 // Matching a GEP with a loop-invariant base ptr.
4386 m_LiveIn(), m_VPValue())))
4387 return std::nullopt;
// Record both the load and its address computation; the walk does not
// continue through the address operands.
4388 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4389 Recipes.push_back(cast<VPInstruction>(GepR));
4391 m_VPValue(Op1)))) {
4392 Worklist.push_back(Op1);
4393 Recipes.push_back(cast<VPInstruction>(V->getDefiningRecipe()));
4394 } else
4395 return std::nullopt;
4396 }
4397
4398 // If we couldn't match anything, don't return the condition. It may be
4399 // defined outside the loop.
4400 if (Recipes.empty() || none_of(Recipes, [](VPInstruction *I) {
4402 }))
4403 return std::nullopt;
4404
4405 return UncountableCondition;
4406}
4407
4413
4414/// Update \p Plan to mask memory operations in the loop based on whether the
4415/// early exit is taken or not.
4416///
4417/// We're currently expecting to find a loop with properties similar to the
4418/// following:
4419///
4420/// for.body:
4421/// ir<%indvars.iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0>
4422/// EMIT ir<%arrayidx> = getelementptr inbounds nuw ir<@c>, ir<%indvars.iv>
4423/// EMIT-SCALAR ir<%0> = load ir<%arrayidx>
4424/// EMIT ir<%cmp1> = icmp sgt ir<%0>, ir<5>
4425/// EMIT vp<%1> = masked-cond ir<%cmp1>
4426/// Successor(s): if.end
4427///
4428/// if.end:
4429/// EMIT ir<%arrayidx3> = getelementptr inbounds nuw ir<@src>, ir<%indvars.iv>
4430/// EMIT-SCALAR ir<%2> = load ir<%arrayidx3>
4431/// EMIT ir<%add> = add nsw ir<%2>, ir<42>
4432/// EMIT ir<%arrayidx5> = getelementptr inbounds nuw ir<@dst>, ir<%indvars.iv>
4433/// EMIT store ir<%add>, ir<%arrayidx5>
4434/// EMIT ir<%indvars.iv.next> = add nuw nsw ir<%indvars.iv>, ir<1>
4435/// EMIT vp<%3> = any-of vp<%1>
4436/// EMIT ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<10000>
4437/// EMIT branch-on-two-conds vp<%3>, ir<%exitcond.not>
4438/// Successor(s): middle.block, middle.block, for.body
4439///
4440/// We currently expect LoopVectorizationLegality to ensure that:
4441/// * There must also be a counted exit. We will need to support speculative
4442/// or first-faulting loads before we can remove this restriction.
4443/// * Any stores within the loop must not alias with the load used for the
4444/// uncountable exit. We can relax this a bit with runtime aliasing checks.
4445/// * Other memory operations in the loop can take place before or after the
4446/// uncountable exit, but must also be unconditional. We need to support
4447/// combining the conditions in VPlanPredicator.
4448/// * The loop must have a single unconditional load contributing to the
4449/// uncountable exit comparison, and the other term must be loop-invariant.
4450/// Improving upon this requires work in getRecipesForUncountableExit to
4451/// handle more complex recipe graphs.
// NOTE(review): this listing omits the line(s) between embedded numbers 4451
// and 4454 that carry the return type, the function name and the leading
// parameters. Judging from the body, the omitted parameters include `Plan` and
// `Exits`, and the function returns bool (false means the caller should give
// up on this VPlan). Interior lines 4490, 4507, 4512, 4526, 4553 and 4590 are
// also missing, so the statements around those gaps are incomplete as shown.
//
// Overview of the visible steps:
//  1. Detach every non-latch early-exiting block from its exit block.
//  2. Collect the recipes computing the uncountable exit condition and find
//     the single load feeding it.
//  3. Check that load is dereferenceable and indexed by a 0-based, step-1
//     widened induction.
//  4. Move the condition recipes to the top of the header and build a mask
//     from the first active lane of the condition.
//  5. Add that mask to every other memory-accessing recipe in the loop body.
//  6. Rewrite the middle-block branch and the scalar preheader resume phi to
//     use the IV value at which the vector loop stopped.
4454 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB,
4455 Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT,
4456 AssumptionCache *AC) {
4457
4458 // Disconnect early exiting blocks from successors, remove branches. We
4459 // currently don't support multiple uses for recipes involved in creating
4460 // the uncountable exit condition.
4461 for (auto &Exit : Exits) {
// Exits taken from the latch itself are left connected.
4462 if (Exit.EarlyExitingVPBB == LatchVPBB)
4463 continue;
4464
4465 for (VPRecipeBase &R : Exit.EarlyExitVPBB->phis())
4466 cast<VPIRPhi>(&R)->removeIncomingValueFor(Exit.EarlyExitingVPBB);
4467 Exit.EarlyExitingVPBB->getTerminator()->eraseFromParent();
4468 VPBlockUtils::disconnectBlocks(Exit.EarlyExitingVPBB, Exit.EarlyExitVPBB);
4469 }
4470
4471 VPDominatorTree VPDT(Plan);
4472
4473 // We can abandon a VPlan entirely if we return false here, so we shouldn't
4474 // crash if some earlier assumptions on scalar IR don't hold for the vplan
4475 // version of the loop.
4476 SmallVector<VPInstruction *, 8> ConditionRecipes;
4477
4478 std::optional<VPValue *> Cond =
4479 getRecipesForUncountableExit(ConditionRecipes, LatchVPBB);
4480 if (!Cond)
4481 return false;
4482
4483 // Find load contributing to condition.
4484 // At the moment LoopVectorizationLegality only supports a single
4485 // early-exit expression with a compare and a single load that must
4486 // be unconditional.
4487 // TODO: Support more than one load.
// NOTE(review): embedded line 4490 (the condition of the lambda's conditional
// expression) is missing from this listing.
4488 auto *Load =
4489 find_singleton<VPInstruction>(ConditionRecipes, [](auto *I, bool _) {
4491 ? I
4492 : nullptr;
4493 });
// NOTE(review): Load is guarded only by this assert but is dereferenced
// unconditionally below (Load->getOperand(0)). With assertions disabled a null
// Load would crash instead of returning false, which seems at odds with the
// "shouldn't crash" remark above -- confirm that legality guarantees exactly
// one load here.
4494 assert(Load && "Couldn't find exactly one load");
4495 // TODO: Support conditional loads for uncountable exits.
4496 assert(VPDT.dominates(Load->getParent(), LatchVPBB) &&
4497 "Uncountable exit condition load is conditional.");
4498 VPInstruction *Ptr = cast<VPInstruction>(Load->getOperand(0));
4499
4500 // Ensure that we are guaranteed to be able to dereference the memory used
4501 // for determining the uncountable exit for the maximum possible number of
4502 // scalar iterations of the loop.
4503 //
4504 // TODO: Support first-faulting loads in cases where we don't know whether
4505 // all possible addresses are dereferenceable.
4506 {
// NOTE(review): embedded lines 4507 and 4512 are missing; presumably they
// declare `Predicates` and open the dereferenceability check whose argument
// list continues at 4513 -- confirm against the original file.
4508 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, TheLoop);
4509 const DataLayout &DL = Plan.getDataLayout();
// The element size is the store size of the loaded scalar type, expressed in
// the index width of the pointer's type.
4510 APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getScalarType()),
4511 DL.getTypeStoreSize(Load->getScalarType()).getFixedValue());
4513 PtrSCEV, cast<LoadInst>(Load->getUnderlyingInstr())->getAlign(),
4514 PSE.getSE()->getConstant(EltSize), TheLoop, *PSE.getSE(), DT, AC,
4515 &Predicates))
4516 return false;
4517 }
4518
4519 // Check for a single GEP for the condition load to see if we can link it to
4520 // a widen IV recipe with a step of 1; we're only interested in contiguous
4521 // accesses for the condition load right now.
// The first recipe of the header is required (via cast<>) to be a widened
// induction.
4522 auto *IV = cast<VPWidenInductionRecipe>(&HeaderVPBB->front());
4523 if (!match(IV->getStartValue(), m_SpecificInt(0)) ||
4524 !match(IV->getStepValue(), m_SpecificInt(1)))
4525 return false;
// NOTE(review): embedded line 4526 (the start of the pointer pattern match) is
// missing from this listing.
4527 m_Specific(IV))))
4528 return false;
4529
4530 // We want to guarantee that the uncountable exit condition (and the mask
4531 // we will generate from it) are available for all operations in the loop
4532 // that need to be masked. If the condition recipes are not already the first
4533 // recipes in the header after the last phi, move them there.
4534 auto InsertIt = HeaderVPBB->getFirstNonPhi();
// Skip over condition recipes that already sit at the front, dropping them
// from the list of recipes that still have to be moved.
4535 while (InsertIt != HeaderVPBB->end() &&
4536 is_contained(ConditionRecipes, &*InsertIt)) {
4537 erase(ConditionRecipes, &*InsertIt);
4538 InsertIt++;
4539 }
// The remaining recipes are moved in reverse list order, each one before
// InsertIt.
4540 for (auto *Recipe : reverse(ConditionRecipes))
4541 Recipe->moveBefore(*HeaderVPBB, InsertIt);
4542
4543 // Create a mask to represent all lanes that fully execute in the vector loop,
4544 // stopping short of any early exit.
4545 VPBuilder MaskBuilder(HeaderVPBB, InsertIt);
4546 VPValue *FirstActive = MaskBuilder.createFirstActiveLane(*Cond);
4547 Type *IVScalarTy = IV->getScalarType();
4548 Type *FirstActiveTy = FirstActive->getScalarType();
4549 VPValue *ALMMultiplier = Plan.getConstantInt(IVScalarTy, 1);
4550 VPValue *Zero = Plan.getZero(IVScalarTy);
// Bring the first-active-lane index to the IV's scalar type.
4551 FirstActive = MaskBuilder.createScalarZExtOrTrunc(FirstActive, IVScalarTy,
4552 FirstActiveTy, DebugLoc());
// NOTE(review): embedded line 4553 is missing; it presumably defines `Mask`
// (used below) from the operands listed on the next line.
4554 {Zero, FirstActive, ALMMultiplier},
4555 DebugLoc(), "uncountable.exit.mask");
4556
4557 // Convert all other memory operations to use the mask.
4558 for (VPBasicBlock *VPBB : vp_rpo_plain_cfg_loop_body(HeaderVPBB))
4559 for (VPRecipeBase &R : *VPBB)
4560 if (R.mayReadOrWriteMemory() && &R != Load) {
4561 // TODO: Handle conditional memory operations in the loop.
4562 if (!VPDT.dominates(R.getParent(), LatchVPBB))
4563 return false;
4564 cast<VPInstruction>(&R)->addMask(Mask);
4565 }
4566
4567 // Update middle block branch to compare (IV + however many lanes were active)
4568 // against the full trip count, since we may be exiting the vector loop early.
4569 // If we didn't take an early exit, we should get the equivalent of VF from
4570 // the FirstActiveLane.
4571 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->end());
4572 VPValue *ScalarIV = MiddleBuilder.createNaryOp(VPInstruction::ExtractLane,
4573 {Zero, IV}, DebugLoc());
4574 VPValue *ExitIV = MiddleBuilder.createAdd(ScalarIV, FirstActive);
4575 VPValue *FullTC =
4576 MiddleBuilder.createICmp(CmpInst::ICMP_EQ, ExitIV, Plan.getTripCount());
4577 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {FullTC});
4578
4579 // Update resume phi in scalar.ph.
4580 VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
4581 auto Phis = ScalarPH->phis();
4582 // TODO: Handle more than one Phi; re-derive from IV.
4583 // TODO: Handle reductions.
// Note that this bail-out happens after the plan has already been modified
// above.
4584 if (range_size(Phis) != 1)
4585 return false;
4586 VPPhi *ContinueIV = cast<VPPhi>(Phis.begin());
4587 // Make sure we're referring to the same IV.
// NOTE(review): embedded line 4590 (the pattern ContinueIV's operand is
// matched against) is missing from this listing.
4588 assert(
4589 match(ContinueIV->getOperand(0),
4591 "Continuing from different IV");
4592 ContinueIV->setOperand(0, ExitIV);
4593 return true;
4594}
4595
// NOTE(review): this listing omits embedded line 4596 (return type and
// function name) and line 4599 (the remaining parameters and the opening
// brace). Interior lines 4604, 4622, 4644, 4646, 4684, 4688 and 4694 are also
// missing, so `Exits`, `RPOT`, `RPOIdx`, the opcode of the masked condition,
// the BranchOnTwoConds creation and the condition guarding the scalar-loop
// path are not visible here.
//
// Overview of the visible steps:
//  1. For every exit-block predecessor other than the middle block, record
//     the early-exiting block, the exit block and the condition under which
//     the exit is taken.
//  2. Order the recorded exits by reverse post-order of the loop body.
//  3. OR all exit conditions together, reduce them with AnyOf and make the
//     latch terminator branch on (any early exit taken, latch exit taken).
//  4. Either hand off to the scalar-loop handling path, or create one
//     vector.early.exit block per exit plus a chain of dispatch blocks that
//     selects the exit whose condition holds at the first active lane.
4597 VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB,
4598 VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE,
// The dominator tree is only consulted by assertions below.
4600#ifndef NDEBUG
4601 VPDominatorTree VPDT(Plan);
4602#endif
4603 VPBuilder LatchBuilder(LatchVPBB->getTerminator());
4605 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
// Iterate over a copy of the predecessor list (to_vector).
4606 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4607 if (Pred == MiddleVPBB)
4608 continue;
4609 // Collect condition for this early exit.
4610 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4611 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4612 VPValue *CondOfEarlyExitingVPBB;
4613 [[maybe_unused]] bool Matched =
4614 match(EarlyExitingVPBB->getTerminator(),
4615 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4616 assert(Matched && "Terminator must be BranchOnCond");
4617
4618 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4619 // the correct block mask.
4620 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
// The branch condition is negated when the exit block is not the first
// successor (the successor read into TrueSucc above).
4621 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4623 TrueSucc == ExitBlock
4624 ? CondOfEarlyExitingVPBB
4625 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4626 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4627 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4628 VPDT.properlyDominates(
4629 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4630 LatchVPBB)) &&
4631 "exit condition must dominate the latch");
4632 Exits.push_back({
4633 EarlyExitingVPBB,
4634 ExitBlock,
4635 CondToEarlyExit,
4636 });
4637 }
4638 }
4639
4640 assert(!Exits.empty() && "must have at least one early exit");
4641 // Sort exits by RPO order to get correct program order. RPO gives a
4642 // topological ordering of the CFG, ensuring upstream exits are checked
4643 // before downstream exits in the dispatch chain.
4645 HeaderVPBB);
// Map each block to its position in the traversal, then sort on it.
4647 for (const auto &[Num, VPB] : enumerate(RPOT))
4648 RPOIdx[VPB] = Num;
4649 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4650 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4651 });
4652#ifndef NDEBUG
4653 // After RPO sorting, verify that for any pair where one exit dominates
4654 // another, the dominating exit comes first. This is guaranteed by RPO
4655 // (topological order) and is required for the dispatch chain correctness.
4656 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4657 for (unsigned J = I + 1; J < Exits.size(); ++J)
4658 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4659 Exits[I].EarlyExitingVPBB) &&
4660 "RPO sort must place dominating exits before dominated ones");
4661#endif
4662
4663 // Build the AnyOf condition for the latch terminator using logical OR
4664 // to avoid poison propagation from later exit conditions when an earlier
4665 // exit is taken.
4666 VPValue *Combined = Exits[0].CondToExit;
4667 for (const EarlyExitInfo &Info : drop_begin(Exits))
4668 Combined = LatchBuilder.createLogicalOr(Combined, Info.CondToExit);
4669
4670 VPValue *IsAnyExitTaken =
4671 LatchBuilder.createNaryOp(VPInstruction::AnyOf, {Combined});
4672
4673 // Create a comparison for the latch exit condition and replace the
4674 // BranchOnCond with a BranchOnTwoConds. The original BranchOnCond's condition
4675 // is used as the latch-exit condition; canonical IV recipes have not been
4676 // introduced yet, so there is no BranchOnCount to derive the condition from.
4677 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4678 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCond &&
4679 "Unexpected terminator");
4680 VPValue *IsLatchExitTaken = LatchExitingBranch->getOperand(0);
4681 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4682 LatchExitingBranch->eraseFromParent();
// With the old terminator erased, new recipes are appended at the end of the
// latch.
4683 LatchBuilder.setInsertPoint(LatchVPBB);
4685 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
// Successors are rebuilt below in the order matching the two conditions.
4686 LatchVPBB->clearSuccessors();
4687
4689 // If handling the exiting lane in the scalar loop, combine the exit
4690 // conditions into a single BranchOnCond.
// Both exit edges of the latch lead to the middle block in this mode, so the
// middle block lists the latch as a predecessor twice.
4691 LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
4692 MiddleVPBB->clearPredecessors();
4693 MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
4695 Plan, Exits, HeaderVPBB, LatchVPBB, MiddleVPBB, TheLoop, PSE, DT, AC);
4696 }
4697
4698 // Create the vector.early.exit blocks.
4699 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4700 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
// NOTE(review): LLVM's Twine documentation discourages storing a Twine in a
// local variable because it may reference temporaries. Confirm this use is
// safe, or build the name in a single expression.
4701 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4702 VPBasicBlock *VectorEarlyExitVPBB =
4703 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4704 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4705 }
4706
4707 // Create the dispatch block (or reuse the single exit block if only one
4708 // exit). The dispatch block computes the first active lane of the combined
4709 // condition and, for multiple exits, chains through conditions to determine
4710 // which exit to take.
4711 VPBasicBlock *DispatchVPBB =
4712 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4713 : Plan.createVPBasicBlock("vector.early.exit.check");
4714 DispatchVPBB->setPredecessors({LatchVPBB});
4715 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4716 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4717 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4718 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4719
4720 // For each early exit, disconnect the original exiting block
4721 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4722 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4723 // values at the first active lane:
4724 //
4725 // Input:
4726 // early.exiting.I:
4727 // ...
4728 // EMIT branch-on-cond vp<%cond.I>
4729 // Successor(s): in.loop.succ, ir-bb<exit.I>
4730 //
4731 // ir-bb<exit.I>:
4732 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4733 //
4734 // Output:
4735 // early.exiting.I:
4736 // ...
4737 // Successor(s): in.loop.succ
4738 //
4739 // vector.early.exit.I:
4740 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4741 // Successor(s): ir-bb<exit.I>
4742 //
4743 // ir-bb<exit.I>:
4744 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4745 // vector.early.exit.I)
4746 //
4747 for (auto [Exit, VectorEarlyExitVPBB] :
4748 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4749 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4750 // Adjust the phi nodes in EarlyExitVPBB.
4751 // 1. remove incoming values from EarlyExitingVPBB,
4752 // 2. extract the incoming value at FirstActiveLane
4753 // 3. add back the extracts as last operands for the phis
4754 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4755 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4756 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4757 // values from VectorEarlyExitVPBB.
4758 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4759 auto *ExitIRI = cast<VPIRPhi>(&R);
4760 VPValue *IncomingVal =
4761 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4762 VPValue *NewIncoming = IncomingVal;
// VPIRValue incoming values are reused as-is; only other values get a
// per-lane extract.
4763 if (!isa<VPIRValue>(IncomingVal)) {
4764 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4765 NewIncoming = EarlyExitBuilder.createNaryOp(
4766 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4767 DebugLoc::getUnknown(), "early.exit.value");
4768 }
4769 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4770 ExitIRI->addIncoming(NewIncoming);
4771 }
4772
4773 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4774 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4775 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4776 }
4777
4778 // Chain through exits: for each exit, check if its condition is true at
4779 // the first active lane. If so, take that exit; otherwise, try the next.
4780 // The last exit needs no check since it must be taken if all others fail.
4781 //
4782 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4783 //
4784 // latch:
4785 // ...
4786 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4787 // ...
4788 //
4789 // vector.early.exit.check:
4790 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4791 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4792 // EMIT branch-on-cond vp<%at.cond.0>
4793 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4794 //
4795 // vector.early.exit.check.0:
4796 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4797 // EMIT branch-on-cond vp<%at.cond.1>
4798 // Successor(s): vector.early.exit.1, vector.early.exit.2
4799 VPBasicBlock *CurrentBB = DispatchVPBB;
// With a single exit drop_back() leaves nothing to iterate, so no dispatch
// branch is created.
4800 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4801 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4802 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4803 DebugLoc::getUnknown(), "exit.cond.at.lane");
4804
4805 // For the last dispatch, branch directly to the last exit on false;
4806 // otherwise, create a new check block.
4807 bool IsLastDispatch = (I + 2 == Exits.size());
4808 VPBasicBlock *FalseBB =
4809 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4810 : Plan.createVPBasicBlock(
4811 Twine("vector.early.exit.check.") + Twine(I));
4812
4813 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4814 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4815 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4816 FalseBB->setPredecessors({CurrentBB});
4817
// Subsequent recipes are emitted into the block reached on false.
4818 CurrentBB = FalseBB;
4819 DispatchBuilder.setInsertPoint(CurrentBB);
4820 }
4821
4822 return true;
4823}
4824
4825/// This function tries to convert extended in-loop reductions to
4826/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4827/// valid. The created recipe must be decomposed to its constituent
4828/// recipes before execution.
///
/// Returns the new VPExpressionRecipe, or nullptr if no extended-reduction
/// pattern was matched or it was not profitable.
// NOTE(review): embedded line 4830 (function name and leading parameters) is
// missing from this listing; the body uses `Red` and `Ctx`. Interior lines
// 4841, 4844, 4846 and 4863 are missing as well, so the range-clamping call,
// the declarations of `CostKind`/`ExtRedCost` and the pattern match guarding
// the final return are not visible here.
4829static VPExpressionRecipe *
4831 VFRange &Range) {
4832 Type *RedTy = Red->getScalarType();
4833 VPValue *VecOp = Red->getVecOp();
4834
4835 assert(!Red->isPartialReduction() &&
4836 "This path does not support partial reductions");
4837
4838 // Clamp the range if using extended-reduction is profitable.
4839 auto IsExtendedRedValidAndClampRange =
4840 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4842 [&](ElementCount VF) {
4843 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4845
// Cost of the separate extend and reduction recipes, used as the baseline.
4847 InstructionCost ExtCost =
4848 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4849 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4850
4851 assert(!RedTy->isFloatingPointTy() &&
4852 "getExtendedReductionCost only supports integer types");
4853 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4854 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4855 Red->getFastMathFlagsOrNone(), CostKind);
// Profitable only if the combined cost is valid and strictly cheaper.
4856 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4857 },
4858 Range);
4859 };
4860
4861 VPValue *A;
4862 // Match reduce(ext).
4864 IsExtendedRedValidAndClampRange(
4865 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4866 cast<VPWidenCastRecipe>(VecOp)->getOpcode(), A->getScalarType()))
4867 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4868
4869 return nullptr;
4870}
4871
4872/// This function tries to convert extended in-loop reductions to
4873/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4874/// and valid. The created VPExpressionRecipe must be decomposed to its
4875/// constituent recipes before execution. Patterns of the
4876/// VPExpressionRecipe:
4877/// reduce.add(mul(...)),
4878/// reduce.add(mul(ext(A), ext(B))),
4879/// reduce.add(ext(mul(ext(A), ext(B)))),
4880/// reduce.fadd(fmul(ext(A), ext(B)))
///
/// Returns the new VPExpressionRecipe, or nullptr if nothing matched or the
/// bundled form was not profitable. Note that the function may rewrite
/// recipes in place (inserting extends, replacing the mul) before returning.
// NOTE(review): embedded line 4882 (function name and leading parameter) is
// missing from this listing; the body uses `Red`. Interior lines 4895, 4897,
// 4899 and 4960 are missing as well, so the cost lambda's parameter list, the
// range-clamping call, the declaration of `CostKind` and part of the constant
// extendability check are not visible here.
4881static VPExpressionRecipe *
4883 VPCostContext &Ctx, VFRange &Range) {
4884 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4885 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4886 Opcode != Instruction::FAdd)
4887 return nullptr;
4888
4889 assert(!Red->isPartialReduction() &&
4890 "This path does not support partial reductions");
4891 Type *RedTy = Red->getScalarType();
4892
4893 // Clamp the range if using multiply-accumulate-reduction is profitable.
4894 auto IsMulAccValidAndClampRange =
4896 VPWidenCastRecipe *OuterExt) -> bool {
4898 [&](ElementCount VF) {
// Without an inner extend the source type is the reduction type itself.
4900 Type *SrcTy = Ext0 ? Ext0->getOperand(0)->getScalarType() : RedTy;
4901 InstructionCost MulAccCost;
4902
4903 // getMulAccReductionCost for in-loop reductions does not support
4904 // mixed or floating-point extends.
4905 if (Ext0 && Ext1 &&
4906 (Ext0->getOpcode() != Ext1->getOpcode() ||
4907 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4908 return false;
4909
4910 bool IsZExt =
4911 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4912 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4913 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4914 SrcVecTy, CostKind);
4915
// Baseline: the sum of the costs of the individual recipes being bundled.
4916 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4917 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4918 InstructionCost ExtCost = 0;
4919 if (Ext0)
4920 ExtCost += Ext0->computeCost(VF, Ctx);
4921 if (Ext1)
4922 ExtCost += Ext1->computeCost(VF, Ctx);
4923 if (OuterExt)
4924 ExtCost += OuterExt->computeCost(VF, Ctx);
4925
4926 return MulAccCost.isValid() &&
4927 MulAccCost < ExtCost + MulCost + RedCost;
4928 },
4929 Range);
4930 };
4931
4932 VPValue *VecOp = Red->getVecOp();
4933 VPRecipeBase *Sub = nullptr;
4934 VPValue *A, *B;
4935 VPValue *Tmp = nullptr;
4936
// NOTE(review): FAdd passes the opcode filter at the top, but a floating-point
// reduction type bails out here, so the reduce.fadd(fmul(...)) pattern listed
// in the header comment looks unreachable in this function as shown --
// confirm whether that is intended.
4937 if (RedTy->isFloatingPointTy())
4938 return nullptr;
4939
4940 // Sub reductions could have a sub between the add reduction and vec op.
4941 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4942 Sub = VecOp->getDefiningRecipe();
4943 VecOp = Tmp;
4944 }
4945
4946 // If ValB is a constant and can be safely extended, truncate it to the same
4947 // type as ExtA's operand, then extend it to the same type as ExtA. This
4948 // creates two uniform extends that can more easily be matched by the rest of
4949 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4950 // replaced with the new extend of the constant.
4951 auto ExtendAndReplaceConstantOp = [](VPWidenCastRecipe *ExtA,
4952 VPWidenCastRecipe *&ExtB, VPValue *&ValB,
4953 VPWidenRecipe *Mul) {
// Only applies when the first operand is an extend, the second is not, and
// the second operand is a VPIRValue.
4954 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4955 return;
4956 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
4957 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4958 const APInt *Const;
4959 if (!match(ValB, m_APInt(Const)) ||
4961 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4962 return;
4963 // The truncate ensures that the type of each extended operand is the
4964 // same, and it's been proven that the constant can be extended from
4965 // NarrowTy safely. Necessary since ExtA's extended operand would be
4966 // e.g. an i8, while the const will likely be an i32. This will be
4967 // elided by later optimisations.
4968 VPBuilder Builder(Mul);
4969 auto *Trunc =
4970 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4971 Type *WideTy = ExtA->getScalarType();
4972 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4973 Mul->setOperand(1, ExtB);
4974 };
4975
4976 // Try to match reduce.add(mul(...)).
4977 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4978 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4979 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4980 auto *Mul = cast<VPWidenRecipe>(VecOp);
4981
4982 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4983 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4984
4985 // Match reduce.add/sub(mul(ext, ext)).
4986 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4987 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4988 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4989 if (Sub)
4990 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4991 cast<VPWidenRecipe>(Sub), Red);
4992 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4993 }
4994 // TODO: Add an expression type for this variant with a negated mul
4995 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4996 return new VPExpressionRecipe(Mul, Red);
4997 }
4998 // TODO: Add an expression type for negated versions of other expression
4999 // variants.
5000 if (Sub)
5001 return nullptr;
5002
5003 // Match reduce.add(ext(mul(A, B))).
5004 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
5005 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
5006 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5007 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
5008 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
5009
5010 // reduce.add(ext(mul(ext, const)))
5011 // -> reduce.add(ext(mul(ext, ext(const))))
5012 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
5013
5014 // reduce.add(ext(mul(ext(A), ext(B))))
5015 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5016 // The inner extends must either have the same opcode as the outer extend or
5017 // be the same, in which case the multiply can never result in a negative
5018 // value and the outer extend can be folded away by doing wider
5019 // extends for the operands of the mul.
5020 if (Ext0 && Ext1 &&
5021 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
5022 Ext0->getOpcode() == Ext1->getOpcode() &&
5023 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
// Re-create the inner extends so they extend directly to the outer extend's
// result type.
5024 auto *NewExt0 = new VPWidenCastRecipe(
5025 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getScalarType(), nullptr,
5026 *Ext0, *Ext0, Ext0->getDebugLoc());
5027 NewExt0->insertBefore(Ext0);
5028
// When both mul operands are the same extend, reuse the single new extend.
5029 VPWidenCastRecipe *NewExt1 = NewExt0;
5030 if (Ext0 != Ext1) {
5031 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
5032 Ext->getScalarType(), nullptr, *Ext1,
5033 *Ext1, Ext1->getDebugLoc());
5034 NewExt1->insertBefore(Ext1);
5035 }
5036 auto *NewMul = Mul->cloneWithOperands({NewExt0, NewExt1});
5037 NewMul->insertBefore(Mul);
// The outer extend and the old mul are replaced by the widened mul.
5038 Ext->replaceAllUsesWith(NewMul);
5039 Ext->eraseFromParent();
5040 Mul->eraseFromParent();
5041 return new VPExpressionRecipe(NewExt0, NewExt1, NewMul, Red);
5042 }
5043 }
5044 return nullptr;
5045}
5046
5047/// This function tries to create abstract recipes from the reduction recipe for
5048/// subsequent optimizations and cost estimation.
///
/// A multiply-accumulate pattern is tried first, then an extended-reduction
/// pattern. If either yields a VPExpressionRecipe, it is inserted right after
/// the position \p Red occupied on entry and takes over all of Red's uses;
/// otherwise nothing is changed by this function itself.
// NOTE(review): embedded line 5049 (return type, function name and the `Red`
// parameter) is missing from this listing.
5050 VPCostContext &Ctx,
5051 VFRange &Range) {
5052 // Creation of VPExpressions for partial reductions is entirely handled in
5053 // transformToPartialReduction.
5054 assert(!Red->isPartialReduction() &&
5055 "This path does not support partial reductions");
5056
5057 VPExpressionRecipe *AbstractR = nullptr;
// Capture the insertion point and parent block before the matchers run.
5058 auto IP = std::next(Red->getIterator());
5059 auto *VPBB = Red->getParent();
5060 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
5061 AbstractR = MulAcc;
5062 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
5063 AbstractR = ExtRed;
5064 // Cannot create abstract inloop reduction recipes.
5065 if (!AbstractR)
5066 return;
5067
5068 AbstractR->insertBefore(*VPBB, IP);
5069 Red->replaceAllUsesWith(AbstractR);
5070}
5071
5082
// NOTE(review): embedded lines 5072-5081 and 5083 are missing from this
// listing, so the signature of this function is not visible; the body uses a
// `Plan` object. Line 5100 (the first half of the skip condition in the main
// loop) is missing as well.
//
// Visible behaviour: for the backedge-taken count, the plan's live-ins and all
// values defined in the entry block, insert an explicit Broadcast in the
// vector preheader and redirect every user that does not use the value as a
// scalar to that Broadcast. Nothing is done for scalar-VF-only plans.
5084 if (Plan.hasScalarVFOnly())
5085 return;
5086
// The dominator tree is only consulted by the assertion below.
5087#ifndef NDEBUG
5088 VPDominatorTree VPDT(Plan);
5089#endif
5090
5091 SmallVector<VPValue *> VPValues;
5092 if (VPValue *BTC = Plan.getBackedgeTakenCount())
5093 VPValues.push_back(BTC);
5094 append_range(VPValues, Plan.getLiveIns());
5095 for (VPRecipeBase &R : *Plan.getEntry())
5096 append_range(VPValues, R.definedValues());
5097
5098 auto *VectorPreheader = Plan.getVectorPreheader();
5099 for (VPValue *VPV : VPValues) {
// Values matching the (partially missing) condition, including constant IR
// live-ins, are skipped.
5101 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
5102 continue;
5103
5104 // Add explicit broadcast at the insert point that dominates all users.
// Default to the end of the preheader; move to its beginning if any
// vector user lives in the preheader itself.
5105 VPBasicBlock *HoistBlock = VectorPreheader;
5106 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
5107 for (VPUser *User : VPV->users()) {
5108 if (User->usesScalars(VPV))
5109 continue;
5110 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
5111 HoistPoint = HoistBlock->begin();
5112 else
5113 assert(VPDT.dominates(VectorPreheader,
5114 cast<VPRecipeBase>(User)->getParent()) &&
5115 "All users must be in the vector preheader or dominated by it");
5116 }
5117
5118 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
5119 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
// Leave the Broadcast's own operand and scalar users untouched.
5120 VPV->replaceUsesWithIf(Broadcast,
5121 [VPV, Broadcast](VPUser &U, unsigned Idx) {
5122 return Broadcast != &U && !U.usesScalars(VPV);
5123 });
5124 }
5125}
5126
5127// Collect common metadata from a group of replicate recipes by intersecting
5128// metadata from all recipes in the group.
// The result starts as a copy of the first recipe's metadata, so `Recipes`
// must not be empty.
// NOTE(review): embedded line 5129 (return type, name and the `Recipes`
// parameter) is missing from this listing.
5130 VPIRMetadata CommonMetadata = *Recipes.front();
5131 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
5132 CommonMetadata.intersect(*Recipe);
5133 return CommonMetadata;
5134}
5135
// Collects groups of predicated load or store replicate recipes (selected by
// the Opcode template argument) in which at least one member's mask is the
// negation of the first member's mask, or vice versa.
// NOTE(review): embedded lines 5137-5139 (return type, name and leading
// parameters), 5146-5147 (declarations of `AllGroups` and `Groups`), 5154 (the
// start of the assertion) and 5164 (the declaration of `Group`) are missing
// from this listing.
5136template <unsigned Opcode>
5140 const Loop *L) {
5141 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
5142 "Only Load and Store opcodes supported");
5143 [[maybe_unused]] constexpr bool IsLoad = (Opcode == Instruction::Load);
5144
5145 // For each address, collect operations with the same or complementary masks.
5148 Plan, PSE, L,
5149 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
5150 for (auto Recipes : Groups) {
// A single recipe cannot form a complementary pair.
5151 if (Recipes.size() < 2)
5152 continue;
5153
5155 map_range(Recipes, bind_back<getLoadStoreValueType>(IsLoad))) &&
5156 "Expected all recipes in group to have the same load-store type");
5157
5158 // Collect groups with the same or complementary masks.
// Entries of Recipes are nulled out once they have been placed in a group.
5159 for (VPReplicateRecipe *&RecipeI : Recipes) {
5160 if (!RecipeI)
5161 continue;
5162
5163 VPValue *MaskI = RecipeI->getMask();
5165 Group.push_back(RecipeI);
5166 RecipeI = nullptr;
5167
5168 // Find all operations with the same or complementary masks.
// NOTE(review): as written, every remaining recipe is appended to the group
// regardless of its mask; the mask comparison (against the first member's
// mask only) merely decides whether the group is kept. Confirm this matches
// the intent of the comments above.
5169 bool HasComplementaryMask = false;
5170 for (VPReplicateRecipe *&RecipeJ : Recipes) {
5171 if (!RecipeJ)
5172 continue;
5173
5174 VPValue *MaskJ = RecipeJ->getMask();
5175 // Check if any operation in the group has a complementary mask with
5176 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
5177 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
5178 match(MaskJ, m_Not(m_Specific(MaskI)));
5179 Group.push_back(RecipeJ);
5180 RecipeJ = nullptr;
5181 }
5182
// Groups without any complementary mask are dropped.
5183 if (HasComplementaryMask) {
5184 assert(Group.size() >= 2 && "must have at least 2 entries");
5185 AllGroups.push_back(std::move(Group));
5186 }
5187 }
5188 }
5189
5190 return AllGroups;
5191}
5192
5193// Find the recipe with minimum alignment in the group.
// InstType is the IR instruction class each recipe's underlying instruction is
// cast to in order to read its alignment. The result of min_element is
// dereferenced unconditionally, so `Group` must not be empty.
// NOTE(review): embedded line 5196 (function name and the `Group` parameter)
// is missing from this listing.
5194template <typename InstType>
5195static VPReplicateRecipe *
5197 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
5198 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
5199 cast<InstType>(B->getUnderlyingInstr())->getAlign();
5200 });
5201}
5202
// NOTE(review): embedded lines 5203-5204 (return type, name and leading
// parameters) and 5207 (the call producing `Groups`) are missing from this
// listing.
//
// Visible behaviour: for each group of loads, if the load's memory location
// can be hoisted without an alias conflict between the first and last member's
// blocks, create one unpredicated load in front of the first member and
// replace and erase every load of the group.
5205 const Loop *L) {
5206 auto Groups =
5208 if (Groups.empty())
5209 return;
5210
5211 // Process each group of loads.
5212 for (auto &Group : Groups) {
5213 // Try to use the earliest (most dominating) load to replace all others.
// NOTE(review): this relies on the group being ordered so that Group[0] is the
// earliest load and Group.back() the latest -- confirm against the grouping
// helper.
5214 VPReplicateRecipe *EarliestLoad = Group[0];
5215 VPBasicBlock *FirstBB = EarliestLoad->getParent();
5216 VPBasicBlock *LastBB = Group.back()->getParent();
5217
5218 // Check that the load doesn't alias with stores between first and last.
5219 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
5220 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
5221 continue;
5222
5223 // Collect common metadata from all loads in the group.
5224 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5225
5226 // Find the load with minimum alignment to use.
// Its underlying instruction is used for the new load.
5227 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
5228
5229 bool IsSingleScalar = EarliestLoad->isSingleScalar();
5230 assert(all_of(Group,
5231 [IsSingleScalar](VPReplicateRecipe *R) {
5232 return R->isSingleScalar() == IsSingleScalar;
5233 }) &&
5234 "all members in group must agree on IsSingleScalar");
5235
5236 // Create an unpredicated version of the earliest load with common
5237 // metadata.
5238 auto *UnpredicatedLoad = new VPReplicateRecipe(
5239 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
5240 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
5241
5242 UnpredicatedLoad->insertBefore(EarliestLoad);
5243
5244 // Replace all loads in the group with the unpredicated load.
5245 for (VPReplicateRecipe *Load : Group) {
5246 Load->replaceAllUsesWith(UnpredicatedLoad);
5247 Load->eraseFromParent();
5248 }
5249 }
5250}
5251
// Returns true if the stores in `StoresToSink` can be sunk without an alias
// conflict between the first and last member's blocks. Returns false when the
// first store has no memory location or its alias tags carry no scope.
// `StoresToSink` must not be empty (front()/back() are used unconditionally).
// NOTE(review): embedded line 5253 (function name and the `StoresToSink`
// parameter) is missing from this listing.
5252static bool
5254 PredicatedScalarEvolution &PSE, const Loop &L) {
5255 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
5256 if (!StoreLoc || !StoreLoc->AATags.Scope)
5257 return false;
5258
5259 // When sinking a group of stores, all members of the group alias each other.
5260 // Skip them during the alias checks.
5261 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
5262 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
5263 SinkStoreInfo SinkInfo(StoresToSink, *StoresToSink[0], PSE, L);
5264 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
5265}
5266
// NOTE(review): embedded lines 5267-5268 (return type, name and leading
// parameters) and 5271 (the call producing `Groups`) are missing from this
// listing.
//
// Visible behaviour: for each group of stores that passes the no-alias check,
// build a chain of selects over the stored values, emit one unpredicated store
// of the selected value at the position of the last store, and erase every
// store of the group.
5269 const Loop *L) {
5270 auto Groups =
5272 if (Groups.empty())
5273 return;
5274
5275 for (auto &Group : Groups) {
5276 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L))
5277 continue;
5278
5279 // Use the last (most dominated) store's location for the unconditional
5280 // store.
5281 VPReplicateRecipe *LastStore = Group.back();
5282 VPBasicBlock *InsertBB = LastStore->getParent();
5283
5284 // Collect common alias metadata from all stores in the group.
5285 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
5286
5287 // Build select chain for stored values.
// The chain starts from the first store's value; each later store contributes
// a select with its mask as the condition, its own value as the first value
// operand and the running selection as the second.
5288 VPValue *SelectedValue = Group[0]->getOperand(0);
5289 VPBuilder Builder(InsertBB, LastStore->getIterator());
5290
5291 bool IsSingleScalar = Group[0]->isSingleScalar();
5292 for (unsigned I = 1; I < Group.size(); ++I) {
5293 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
5294 "all members in group must agree on IsSingleScalar");
5295 VPValue *Mask = Group[I]->getMask();
5296 VPValue *Value = Group[I]->getOperand(0);
5297 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
5298 Group[I]->getDebugLoc());
5299 }
5300
5301 // Find the store with minimum alignment to use.
5302 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5303
5304 // Create unconditional store with selected value and common metadata.
// The address operand is taken from the last store.
5305 auto *UnpredicatedStore = new VPReplicateRecipe(
5306 StoreWithMinAlign->getUnderlyingInstr(),
5307 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5308 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5309 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5310
5311 // Remove all predicated stores from the group.
5312 for (VPReplicateRecipe *Store : Group)
5313 Store->eraseFromParent();
5314 }
5315}
5316
5318 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
     // Overview: when the trip count is a live-in IR value whose SCEV is a
     // constant, compute (TC / (VF * UF)) * (VF * UF) via SCEV and, if that
     // folds to a constant, record it as the underlying value of the plan's
     // vector trip count. Nothing is done when the trip count has no users.
5320 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5321 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5322
5323 VPValue *TC = Plan.getTripCount();
5324 if (TC->user_empty())
5325 return;
5326
5327 // Skip cases for which the trip count may be non-trivial to materialize.
5328 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
5329 // tail is required.
     // NOTE(review): one line of the condition below is missing from this
     // listing, so the middle clause cannot be read in full here.
5330 if (!Plan.hasScalarTail() ||
5332 Plan.getScalarPreheader() ||
5333 !isa<VPIRValue>(TC))
5334 return;
5335
5336 // Materialize vector trip counts for constants early if it can simply
5337 // be computed as (Original TC / VF * UF) * VF * UF.
5338 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5339 // tail-folded loops.
5340 ScalarEvolution &SE = *PSE.getSE();
5341 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5342 if (!isa<SCEVConstant>(TCScev))
5343 return;
5344 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5345 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
     // Only record the result when SCEV folded it to a constant.
5346 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5347 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5348}
5349
5351 VPBasicBlock *VectorPH) {
     // Overview: replaces all uses of BTC with an explicit "trip count - 1"
     // computed at the start of VectorPH. Nothing is emitted if BTC is unused.
     // NOTE(review): the definition of BTC is on a line missing from this
     // listing; presumably it is the plan's backedge-taken count - confirm.
5353 if (BTC->user_empty())
5354 return;
5355
5356 VPBuilder Builder(VectorPH, VectorPH->begin());
5357 auto *TCTy = Plan.getTripCount()->getScalarType();
5358 auto *TCMO =
5359 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5360 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5361 BTC->replaceAllUsesWith(TCMO);
5362}
5363
// Overview (function header is missing from this listing): makes conversions
// between scalar and vector values explicit. The first loop nest inserts
// BuildVector/BuildStructVector-style VPInstructions after scalar-producing
// recipes that have vector users; the second inserts Unpack VPInstructions
// after definitions that have scalar users outside replicate regions. Plans
// with only a scalar VF are left untouched.
5365 if (Plan.hasScalarVFOnly())
5366 return;
5367
5368 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
     // NOTE(review): the argument of the first blocksOnly call is on a line
     // missing from this listing.
5369 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5371 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5372 vp_depth_first_shallow(LoopRegion->getEntry()));
5373 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5374 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5375 // regions. Those are not materialized explicitly yet.
5376 // TODO: materialize build vectors for replicating recipes in replicating
5377 // regions.
5378 for (VPBasicBlock *VPBB :
5379 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5380 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
     // NOTE(review): the filter guarding this 'continue' is on a missing line.
5382 continue;
5383 auto *DefR = cast<VPSingleDefRecipe>(&R);
     // True for users that do not consume DefR's scalars, or whose recipe is
     // not located directly in LoopRegion.
5384 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5385 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5386 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5387 };
5388 if ((isa<VPReplicateRecipe>(DefR) &&
5389 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5390 (isa<VPInstruction>(DefR) &&
5392 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5393 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5394 continue;
5395
5396 Type *ScalarTy = DefR->getScalarType();
     // The opcode depends on whether the scalar type is a struct; the two
     // alternatives are on lines missing from this listing.
5397 unsigned Opcode = ScalarTy->isStructTy()
5400 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5401 BuildVector->insertAfter(DefR);
5402
     // Redirect only the matching users; the new BuildVector itself is
     // excluded so that it keeps DefR as its operand.
5403 DefR->replaceUsesWithIf(
5404 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5405 VPUser &U, unsigned) {
5406 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5407 });
5408 }
5409 }
5410
5411 // Create explicit VPInstructions to convert vectors to scalars. The current
5412 // implementation is conservative - it may miss some cases that may or may not
5413 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5414 // if they are known to operate on scalar values.
5415 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5416 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
     // NOTE(review): the start of this isa<...> filter is on a missing line.
5418 VPDerivedIVRecipe>(&R))
5419 continue;
5420 for (VPValue *Def : R.definedValues()) {
5421 // Skip recipes that are single-scalar or only have their first lane
5422 // used.
5423 // TODO: The Defs skipped here may or may not be vector values.
5424 // Introduce Unpacks, and remove them later, if they are guaranteed to
5425 // produce scalar values.
5427 continue;
5428
5429 // At the moment, we create unpacks only for scalar users outside
5430 // replicate regions. Recipes inside replicate regions still extract the
5431 // required lanes implicitly.
5432 // TODO: Remove once replicate regions are unrolled completely.
5433 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5434 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5435 return U->usesScalars(Def) &&
5436 (!ParentRegion || !ParentRegion->isReplicator());
5437 };
5438 if (none_of(Def->users(), IsCandidateUnpackUser))
5439 continue;
5440
5441 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
     // For a phi, the Unpack goes at the block's first non-phi position
     // instead of directly after the defining recipe.
5442 if (R.isPhi())
5443 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5444 else
5445 Unpack->insertAfter(&R);
5446 Def->replaceUsesWithIf(Unpack,
5447 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5448 return IsCandidateUnpackUser(&U);
5449 });
5450 }
5451 }
5452 }
5453}
5454
5456 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
5457 bool RequiresScalarEpilogue, VPValue *Step,
5458 std::optional<uint64_t> MaxRuntimeStep) {
     // Overview: emits the vector trip count computation into VectorPHVPBB and
     // replaces all uses of Plan.getVectorTripCount() with it. The result is
     // n.vec = TC - (TC urem Step), where TC is first rounded up by Step - 1
     // if TailByMasking is set, and a zero remainder is replaced by Step if
     // RequiresScalarEpilogue is set. If TC is a constant divisible by
     // MaxRuntimeStep and no scalar epilogue is required, TC itself is used.
5459 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5460 // There's nothing to do if there are no users of the vector trip count or its
5461 // IR value has already been set.
5462 if (VectorTC.user_empty() || VectorTC.getUnderlyingValue())
5463 return;
5464
5465 VPValue *TC = Plan.getTripCount();
5466 Type *TCTy = TC->getScalarType();
5467 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5468 if (auto *StepR = Step->getDefiningRecipe()) {
5469 assert(VPDominatorTree(Plan).dominates(StepR->getParent(), VectorPHVPBB) &&
5470 "Step VPBB must dominate VectorPHVPBB");
5471 // Insert after Step's definition to maintain valid def-use ordering.
5472 InsertPt = std::next(StepR->getIterator());
5473 }
5474 VPBuilder Builder(VectorPHVPBB, InsertPt);
5475
5476 // For scalable steps, if TC is a constant and is divisible by the maximum
5477 // possible runtime step, then TC % Step == 0 for all valid vscale values
5478 // and the vector trip count equals TC directly.
5479 const APInt *TCVal;
5480 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
5481 TCVal->urem(*MaxRuntimeStep) == 0) {
5482 VectorTC.replaceAllUsesWith(TC);
5483 return;
5484 }
5485
5486 // If the tail is to be folded by masking, round the number of iterations N
5487 // up to a multiple of Step instead of rounding down. This is done by first
5488 // adding Step-1 and then rounding down. Note that it's ok if this addition
5489 // overflows: the vector induction variable will eventually wrap to zero given
5490 // that it starts at zero and its Step is a power of two; the loop will then
5491 // exit, with the last early-exit vector comparison also producing all-true.
5492 if (TailByMasking) {
5493 TC = Builder.createAdd(
5494 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5495 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5496 }
5497
5498 // Now we need to generate the expression for the part of the loop that the
5499 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5500 // iterations are not required for correctness, or N - Step, otherwise. Step
5501 // is equal to the vectorization factor (number of SIMD elements) times the
5502 // unroll factor (number of SIMD instructions).
5503 VPValue *R =
5504 Builder.createNaryOp(Instruction::URem, {TC, Step},
5505 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5506
5507 // There are cases where we *must* run at least one iteration in the remainder
5508 // loop. See the cost model for when this can happen. If the step evenly
5509 // divides the trip count, we set the remainder to be equal to the step. If
5510 // the step does not evenly divide the trip count, no adjustment is necessary
5511 // since there will already be scalar iterations. Note that the minimum
5512 // iterations check ensures that N >= Step.
5513 if (RequiresScalarEpilogue) {
     // NOTE(review): the assert message below reads "fail folding"; this is
     // presumably a typo for "tail folding". Left untouched here because it is
     // a runtime string.
5514 assert(!TailByMasking &&
5515 "requiring scalar epilogue is not supported with fail folding");
5516 VPValue *IsZero =
5517 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5518 R = Builder.createSelect(IsZero, Step, R);
5519 }
5520
5521 VPValue *Res =
5522 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5523 VectorTC.replaceAllUsesWith(Res);
5524}
5525
5527 ElementCount VFEC) {
     // Overview: replaces the plan's symbolic VF and VFxUF values with values
     // computed at the start of VectorPH from VFEC and the plan's concrete UF.
5528 // If VF and VFxUF have already been materialized (no remaining users),
5529 // there's nothing more to do.
5530 if (Plan.getVF().isMaterialized()) {
5531 assert(Plan.getVFxUF().isMaterialized() &&
5532 "VF and VFxUF must be materialized together");
5533 return;
5534 }
5535
5536 VPBuilder Builder(VectorPH, VectorPH->begin());
5537 Type *TCTy = Plan.getTripCount()->getScalarType();
5538 VPValue &VF = Plan.getVF();
5539 VPValue &VFxUF = Plan.getVFxUF();
5540 // If there are no users of the runtime VF, compute VFxUF by constant folding
5541 // the multiplication of VF and UF.
5542 if (VF.user_empty()) {
5543 VPValue *RuntimeVFxUF =
5544 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5545 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5546 return;
5547 }
5548
5549 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5550 // vscale) * UF.
5551 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
     // NOTE(review): two lines of the following statement group are missing
     // from this listing. What is visible creates a Broadcast of RuntimeVF and
     // passes it, with a predicate selecting users that do not use VF's
     // scalars, to a call whose name is not visible.
5553 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5555 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5556 }
5557 VF.replaceAllUsesWith(RuntimeVF);
5558
     // VFxUF = RuntimeVF * UF, built with createOverflowingOp and the wrap
     // flags {true, false}.
5559 VPValue *MulByUF = Builder.createOverflowingOp(
5560 Instruction::Mul,
5561 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5562 {true, false});
5563 VFxUF.replaceAllUsesWith(MulByUF);
5564}
5565
// Overview (function header is missing from this listing): creates an
// IncomingAliasMask VPInstruction of type i1 in the vector preheader and
// rewires every existing user of the header mask to use
// "HeaderMask & AliasMask" instead.
// NOTE(review): HeaderMask is dereferenced without a null check; presumably
// callers guarantee that the plan has a header mask - confirm.
5567 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
5568 auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
5569 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5570
5571 VPBuilder Builder(Plan.getVectorPreheader());
5572 auto *AliasMask = Builder.createNaryOp(
5573 VPInstruction::IncomingAliasMask, {}, nullptr, {}, {},
5574 DebugLoc::getUnknown(), "incoming.alias.mask", I1Ty);
5575
     // Place the AND after the header mask's definition; for a phi, use the
     // first non-phi position of its block instead.
5576 if (HeaderMaskDef->isPhi())
5577 Builder = VPBuilder(&*HeaderMaskDef->getParent()->getFirstNonPhi());
5578 else
5579 Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
5580
5581 // Update all existing users of the header mask to "HeaderMask & AliasMask".
     // The AND itself is excluded so that it keeps HeaderMask as its operand.
5582 auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
5583 HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
5584 return &U != ClampedHeaderMask;
5585 });
5586}
5587
/// Builds the alias mask in \p AliasCheckVPBB: one loop_dependence_war_mask
/// intrinsic per entry of \p DiffChecks, combined with AND. All uses of the
/// plan's incoming alias mask are replaced with the combined mask. Returns the
/// number of active lanes of the combined mask, converted from the index type
/// to IVTy.
/// NOTE(review): if \p DiffChecks is empty, AliasMask stays null and is still
/// passed on below; presumably callers always pass at least one check.
5588VPValue *
5590 ArrayRef<PointerDiffInfo> DiffChecks) {
5591 VPBuilder Builder(AliasCheckVPBB);
5592 Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
5593
5594 VPValue *IncomingAliasMask = vputils::findIncomingAliasMask(Plan);
5595 assert(IncomingAliasMask && "Expected an alias mask!");
5596
5597 VPValue *AliasMask = nullptr;
5598 for (const PointerDiffInfo &Check : DiffChecks) {
     // NOTE(review): the definition of Src and the initializer of Sink are on
     // lines missing from this listing.
5600 VPValue *Sink =
5602 Type *AddrType = Src->getScalarType();
5603
5604 // TODO: Only freeze the required pointer (not both src and sink).
5605 if (Check.NeedsFreeze) {
5606 Src = Builder.createScalarFreeze(Src, AddrType, DebugLoc::getUnknown());
5607 Sink = Builder.createScalarFreeze(Sink, AddrType, DebugLoc::getUnknown());
5608 }
5609
5610 // TODO: Generate loop_dependence_raw_mask when there's a read-after-write
5611 // dependency between the source and the sink. This is not necessary for
5612 // correctness of the mask, but using the "raw" variant prevents loads
5613 // depending on the completion of stores.
5614 VPWidenIntrinsicRecipe *WARMask = Builder.insert(new VPWidenIntrinsicRecipe(
5615 Intrinsic::loop_dependence_war_mask,
5616 {Src, Sink, Plan.getConstantInt(AddrType, Check.AccessSize)}, I1Ty));
5617
     // Accumulate the per-check masks with AND.
5618 if (AliasMask)
5619 AliasMask = Builder.createAnd(AliasMask, WARMask);
5620 else
5621 AliasMask = WARMask;
5622 }
5623
     // NOTE(review): the definition of IVTy is on a line missing from this
     // listing.
5625 Type *IndexTy = Plan.getDataLayout().getIndexType(Plan.getContext(), 0);
5626 VPValue *NumActive = Builder.createNaryOp(
5627 VPInstruction::NumActiveLanes, {AliasMask}, nullptr, {}, {},
5628 DebugLoc::getUnknown(), "num.active.lanes", IndexTy);
5629 VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
5630 NumActive, IVTy, IndexTy, DebugLoc::getCompilerGenerated());
5631
5632 IncomingAliasMask->replaceAllUsesWith(AliasMask);
5633
5634 return ClampedVF;
5635}
5636
5638 VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
     // Overview: creates the "vector.clamped.vf.check" block, materializes the
     // alias mask and the clamped VF in it, attaches a check on
     // (ClampedVF <= 1) || ((UMax - TripCount) < ClampedVF) via
     // attachVPCheckBlock, and finally replaces the plan's VF and VFxUF with
     // the clamped VF.
5639 VPBasicBlock *ClampedVFCheck =
5640 Plan.createVPBasicBlock("vector.clamped.vf.check");
5641
5642 VPValue *ClampedVF = materializeAliasMask(Plan, ClampedVFCheck, DiffChecks);
5643 VPBuilder Builder(ClampedVFCheck);
     // NOTE(review): the definition of DL is on a line missing from this
     // listing.
5645 Type *TCTy = Plan.getTripCount()->getScalarType();
5646
5647 // Check the "ClampedVF" from the alias mask is larger than one.
5648 VPValue *IsScalar =
5649 Builder.createICmp(CmpInst::ICMP_ULE, ClampedVF,
5650 Plan.getConstantInt(TCTy, 1), DL, "vf.is.scalar");
5651
5652 VPValue *TripCount = Plan.getTripCount();
     // NOTE(review): the initializer of MaxUIntTripCount is on a missing line.
5653 VPValue *MaxUIntTripCount =
5655 VPValue *DistanceToMax = Builder.createSub(MaxUIntTripCount, TripCount);
5656
5657 // For tail-folding: Don't execute the vector loop if (UMax - n) < ClampedVF.
5658 // Note: The ClampedVF may not be a power-of-two. This means the loop exit
5659 // condition (index.next == n.vec) may not be correct in the case of an
5660 // overflow. The issue is `n.vec` could be zero due to an overflow, but
5661 // index.next is not guaranteed to overflow to zero as the ClampedVF is not a
5662 // power-of-two.
5663 VPValue *TripCountCheck = Builder.createICmp(
5664 ICmpInst::ICMP_ULT, DistanceToMax, ClampedVF, DL, "vf.step.overflow");
5665
5666 VPValue *Cond = Builder.createOr(IsScalar, TripCountCheck, DL);
5667 attachVPCheckBlock(Plan, Cond, ClampedVFCheck, HasBranchWeights);
5668
5669 // Materialize the trip count early as this will add a use of (VFxUF) that
5670 // needs to be replaced with the ClampedVF.
     // NOTE(review): the first line of this call is missing from this listing.
5672 /*TailByMasking=*/true,
5673 /*RequiresScalarEpilogue=*/false,
5674 &Plan.getVFxUF());
5675
5676 assert(Plan.getConcreteUF() == 1 &&
5677 "Clamped VF not supported with interleaving");
5678 Plan.getVF().replaceAllUsesWith(ClampedVF);
5679 Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
5680}
5681
5683 ScalarEvolution &SE) {
     // Overview: walks the recipes of the plan's entry block and replaces each
     // used VPExpandSCEVRecipe that VPSCEVExpander can handle with the
     // expanded VPValue, erasing the recipe afterwards.
5684 auto *Entry = Plan.getEntry();
5685 VPBuilder Builder(Entry, Entry->begin());
     // NOTE(review): the start of the expression defining DL is on a line
     // missing from this listing; the visible part takes the debug location of
     // an IR basic block's terminator.
5687 ->getIRBasicBlock()
5688 ->getTerminator()
5689 ->getDebugLoc();
5690 VPSCEVExpander Expander(Builder, SE, DL);
5691
5692 // Expand VPExpandSCEVRecipes to VPInstructions using VPSCEVExpander. During
5693 // the transition, unsupported VPExpandSCEVRecipes are skipped and left for
5694 // late expansion.
5695 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5696 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5697 if (!ExpSCEV || ExpSCEV->user_empty())
5698 continue;
     // Emit the expansion at the position of the recipe being replaced.
5699 Builder.setInsertPoint(ExpSCEV);
5700 VPValue *Expanded = Expander.tryToExpand(ExpSCEV->getSCEV());
     // A null result means the expression is not supported; keep the recipe.
5701 if (!Expanded)
5702 continue;
5703 ExpSCEV->replaceAllUsesWith(Expanded);
5704 // TripCount should not be used after expansion to VPInstructions. Reset to
5705 // poison to avoid dangling references.
5706 if (Plan.getTripCount() == ExpSCEV)
5707 Plan.resetTripCount(Plan.getPoison(ExpSCEV->getScalarType()));
5708 ExpSCEV->eraseFromParent();
5709 }
5710}
5711
// Overview (function header is missing from this listing): expands every
// VPExpandSCEVRecipe in the plan's entry block to IR with SCEVExpander,
// inserting before the terminator of the entry IR basic block. Each recipe is
// replaced by a live-in for the expanded value and then erased. Returns a map
// from each expanded SCEV to its IR value.
5714 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5715
5716 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5717 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5718 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5719 // Expand remaining VPExpandSCEVRecipes to IR instructions using SCEVExpander.
5720 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5721 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5722 if (!ExpSCEV)
5723 continue;
5724 const SCEV *Expr = ExpSCEV->getSCEV();
5725 Value *Res =
5726 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5727 ExpandedSCEVs[Expr] = Res;
5728 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5729 ExpSCEV->replaceAllUsesWith(Exp);
     // Keep the plan's trip count pointing at a live value.
5730 if (Plan.getTripCount() == ExpSCEV)
5731 Plan.resetTripCount(Exp);
5732 ExpSCEV->eraseFromParent();
5733 }
     // NOTE(review): the condition of the assert whose message follows is on a
     // line missing from this listing.
5735 "all VPExpandSCEVRecipes must have been expanded");
5736 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5737 // to the VPIRBasicBlock.
5738 auto EI = Entry->begin();
     // Walk the IR instructions (all but the last) in step with the existing
     // recipes; an instruction already wrapped by the current VPIRInstruction
     // is skipped.
5739 for (Instruction &I : drop_end(*EntryBB)) {
5740 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5741 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5742 EI++;
5743 continue;
5744 }
     // NOTE(review): the statement handling instructions without a matching
     // recipe is on a line missing from this listing.
5746 }
5747
5748 return ExpandedSCEVs;
5749}
5750
5751/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5752/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5753/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5754/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5755/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5756/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5757/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5758/// is defined at \p Idx of a load interleave group.
5759static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5760 VPValue *OpV, unsigned Idx, bool IsScalable) {
5761 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5762 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5763 if (!Member0OpR) {
5764 // Member0's operand is a uniform live-in, broadcast across all fields.
5765 if (Member0Op == OpV)
5766 return true;
5767 // Otherwise distinct per-field live-ins are assembled into a BuildVector.
5768 return !IsScalable && !OpV->hasDefiningRecipe() &&
5769 OpV->getScalarType() == Member0Op->getScalarType();
5770 }
5771 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5772 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5773 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5774 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5775 Member0Op == OpV;
5776 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5777 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5778 return false;
5779}
5780
/// Returns true if the values in \p Ops, one per interleave-group member, can
/// be narrowed together. The first value must be a VPRecipeWithIRFlags, and
/// every value must agree with it on opcode or intrinsic ID, scalar type and
/// (if present) predicate. Each operand position is then checked recursively
/// with canNarrowOps, falling back to canNarrowLoad for every member.
5781static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5783 auto *WideMember0 = dyn_cast<VPRecipeWithIRFlags>(Ops[0]);
5784 if (!WideMember0)
5785 return false;
5786 for (VPValue *V : Ops) {
     // NOTE(review): the condition guarding this early return is on a line
     // missing from this listing.
5788 return false;
5789 auto *R = cast<VPRecipeWithIRFlags>(V);
5790 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5791 return false;
5792 if (R->getScalarType() != WideMember0->getScalarType())
5793 return false;
5794 if (R->hasPredicate() && R->getPredicate() != WideMember0->getPredicate())
5795 return false;
5796 }
5797
5798 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
     // Gather operand Idx of every member. NOTE(review): the declaration of
     // OpsI is on a line missing from this listing.
5800 for (VPValue *Op : Ops)
5801 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5802
     // The operands may themselves be matching wide recipes.
5803 if (canNarrowOps(OpsI, IsScalable))
5804 continue;
5805
     // Otherwise every operand must be narrowable as a load or live-in; the
     // enumerate index is the member index within the group.
5806 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5807 const auto &[OpIdx, OpV] = P;
5808 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5809 }))
5810 return false;
5811 }
5812
5813 return true;
5814}
5815
5816/// Returns VF from \p VFs if \p InterleaveR is an unmasked interleave group
5817/// whose factor and number of members both equal VF and whose members all
5818/// share one scalar type. The interleave group must also access the full
/// vector width reported by \p TTI. Returns std::nullopt otherwise, including
/// when \p InterleaveR is null.
5819static std::optional<ElementCount>
5822 const TargetTransformInfo &TTI) {
5823 if (!InterleaveR || InterleaveR->getMask())
5824 return std::nullopt;
5825
     // Determine the common element type: from the defined values for a load
     // group (no stored values), from the stored values otherwise.
5826 Type *GroupElementTy = nullptr;
5827 if (InterleaveR->getStoredValues().empty()) {
5828 GroupElementTy = InterleaveR->getVPValue(0)->getScalarType();
5829 if (!all_of(InterleaveR->definedValues(), [GroupElementTy](VPValue *Op) {
5830 return Op->getScalarType() == GroupElementTy;
5831 }))
5832 return std::nullopt;
5833 } else {
5834 GroupElementTy = InterleaveR->getStoredValues()[0]->getScalarType();
5835 if (!all_of(InterleaveR->getStoredValues(), [GroupElementTy](VPValue *Op) {
5836 return Op->getScalarType() == GroupElementTy;
5837 }))
5838 return std::nullopt;
5839 }
5840
     // The group must have no gaps.
5841 auto IG = InterleaveR->getInterleaveGroup();
5842 if (IG->getFactor() != IG->getNumMembers())
5843 return std::nullopt;
5844
     // NOTE(review): the arguments of getRegisterBitWidth are on lines missing
     // from this listing.
5845 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5846 TypeSize Size = TTI.getRegisterBitWidth(
5849 assert(Size.isScalable() == VF.isScalable() &&
5850 "if Size is scalable, VF must be scalable and vice versa");
5851 return Size.getKnownMinValue();
5852 };
5853
     // Pick the first VF whose known-minimum element count equals the group
     // factor and for which element size times that count equals the register
     // width.
5854 for (ElementCount VF : VFs) {
5855 unsigned MinVal = VF.getKnownMinValue();
5856 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5857 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5858 return {VF};
5859 }
5860 return std::nullopt;
5861}
5862
5863/// Returns true if \p VPValue is a narrow VPValue.
5864static bool isAlreadyNarrow(VPValue *VPV) {
5865 if (isa<VPIRValue>(VPV))
5866 return true;
5867 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5868 return RepR && RepR->isSingleScalar();
5869}
5870
5871// Convert the wide recipes defining the VPValues in \p Members feeding an
5872// interleave group to a single narrow variant. The first member is reused as
5873// the narrowed recipe. BuildVectors for live-in operands are inserted into \p
5874// Preheader.
// Values created or accepted as narrow are recorded in \p NarrowedOps; a first
// member already in that set is returned unchanged. Returns the narrowed
// value.
5876 SmallPtrSetImpl<VPValue *> &NarrowedOps,
5877 VPBasicBlock *Preheader) {
5878 VPValue *V = Members.front();
5879 auto *R = V->getDefiningRecipe();
5880 if (NarrowedOps.contains(V))
5881 return V;
5882
     // Live-ins: assemble all members into one BuildVector in the preheader.
5883 if (!R) {
5884 assert(all_of(Members,
5885 [V](VPValue *M) {
5886 return !M->hasDefiningRecipe() &&
5887 M->getScalarType() == V->getScalarType();
5888 }) &&
5889 "expected distinct live-ins of matching scalar type");
5890 auto *BV = new VPInstruction(VPInstruction::BuildVector, Members);
5891 Preheader->appendRecipe(BV);
5892 NarrowedOps.insert(BV);
5893 return BV;
5894 }
5895
5896 if (isAlreadyNarrow(V))
5897 return V;
5898
     // NOTE(review): the condition opening the following block is on a line
     // missing from this listing. Inside, member 0's recipe is reused: its IR
     // flags are intersected with those of the other members and each operand
     // is replaced by its recursively narrowed form.
5900 auto *WideMember0 = cast<VPRecipeWithIRFlags>(R);
5901 for (VPValue *Member : Members.drop_front())
5902 WideMember0->intersectFlags(*cast<VPRecipeWithIRFlags>(Member));
5903 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) {
     // NOTE(review): the declaration of OpsI is on a line missing from this
     // listing.
5905 for (VPValue *Member : Members)
5906 OpsI.push_back(Member->getDefiningRecipe()->getOperand(Idx));
5907 WideMember0->setOperand(
5908 Idx, narrowInterleaveGroupOp(OpsI, NarrowedOps, Preheader));
5909 }
5910 return V;
5911 }
5912
5913 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5914 // Narrow interleave group to wide load, as transformed VPlan will only
5915 // process one original iteration.
5916 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5917 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5918 LoadGroup->getMask(), /*Consecutive=*/true,
5919 *LoadGroup, LoadGroup->getDebugLoc());
5920 L->insertBefore(LoadGroup);
5921 NarrowedOps.insert(L);
5922 return L;
5923 }
5924
     // A replicate recipe reaching this point is expected to be a single-scalar
     // load; it is recorded and returned as is.
5925 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5926 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5927 "must be a single scalar load");
5928 NarrowedOps.insert(RepR);
5929 return RepR;
5930 }
5931
     // Remaining case: a wide load. Look through a VPVectorPointerRecipe to
     // its first operand when forming the address.
5932 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5933 VPValue *PtrOp = WideLoad->getAddr();
5934 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5935 PtrOp = VecPtr->getOperand(0);
5936 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5937 // process one original iteration.
5938 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5939 /*IsUniform*/ true,
5940 /*Mask*/ nullptr, {}, *WideLoad);
5941 N->insertBefore(WideLoad);
5942 NarrowedOps.insert(N);
5943 return N;
5944}
5945
/// Tries to narrow all interleave groups of \p Plan for a single VF. The loop
/// must be a single block ending in a branch-on-count. On success the store
/// interleave groups are replaced by wide stores of narrowed operands and the
/// induction step is rewritten to UF (times vscale for scalable VFs). If the
/// plan had more than one VF, it is split: the original keeps only the
/// optimized VF, and a duplicate holding the remaining VFs is returned.
/// Returns nullptr when nothing was changed, and also when the plan was
/// narrowed without needing a split.
5946std::unique_ptr<VPlan>
5948 const TargetTransformInfo &TTI) {
5949 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5950
5951 if (!VectorLoop)
5952 return nullptr;
5953
5954 // Only handle single-block loops for now.
5955 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5956 return nullptr;
5957
5958 // Skip plans when we may not be able to properly narrow.
5959 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5960 if (!match(&Exiting->back(), m_BranchOnCount()))
5961 return nullptr;
5962
5963 assert(match(&Exiting->back(),
5965 m_Specific(&Plan.getVectorTripCount()))) &&
5966 "unexpected branch-on-count");
5967
     // NOTE(review): the declaration of StoreGroups is on a line missing from
     // this listing.
5969 std::optional<ElementCount> VFToOptimize;
5970 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     // NOTE(review): the filter guarding this 'continue' is on missing lines.
5973 continue;
5974
5975 // Bail out on recipes not supported at the moment:
5976 // * phi recipes other than the canonical induction
5977 // * recipes writing to memory except interleave groups
5978 // Only support plans with a canonical induction phi.
5979 if (R.isPhi())
5980 return nullptr;
5981
5982 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5983 if (R.mayWriteToMemory() && !InterleaveR)
5984 return nullptr;
5985
5986 // Bail out if any recipe defines a vector value used outside the
5987 // vector loop region.
5988 if (any_of(R.definedValues(), [&](VPValue *V) {
5989 return any_of(V->users(), [&](VPUser *U) {
5990 auto *UR = cast<VPRecipeBase>(U);
5991 return UR->getParent()->getParent() != VectorLoop;
5992 });
5993 }))
5994 return nullptr;
5995
5996 // All other ops are allowed, but we reject uses that cannot be converted
5997 // when checking all allowed consumers (store interleave groups) below.
5998 if (!InterleaveR)
5999 continue;
6000
6001 // Try to find a single VF, where all interleave groups are consecutive and
6002 // saturate the full vector width. If we already have a candidate VF, check
6003 // if it is applicable for the current InterleaveR, otherwise look for a
6004 // suitable VF across the Plan's VFs.
6006 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
6007 : to_vector(Plan.vectorFactors());
6008 std::optional<ElementCount> NarrowedVF =
6009 isConsecutiveInterleaveGroup(InterleaveR, VFs, TTI);
6010 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
6011 return nullptr;
6012 VFToOptimize = NarrowedVF;
6013
6014 // Skip read interleave groups.
6015 if (InterleaveR->getStoredValues().empty())
6016 continue;
6017
6018 // Narrow interleave groups, if all operands are already matching narrow
6019 // ops.
6020 auto *Member0 = InterleaveR->getStoredValues()[0];
6021 if (isAlreadyNarrow(Member0) &&
6022 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
6023 StoreGroups.push_back(InterleaveR);
6024 continue;
6025 }
6026
6027 // For now, we only support full interleave groups storing load interleave
6028 // groups.
6029 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
6030 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
6031 if (!DefR)
6032 return false;
6033 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
6034 return IR && IR->getInterleaveGroup()->isFull() &&
6035 IR->getVPValue(Op.index()) == Op.value();
6036 })) {
6037 StoreGroups.push_back(InterleaveR);
6038 continue;
6039 }
6040
6041 // Check if all values feeding InterleaveR are matching wide recipes, with
6042 // operands that can be narrowed.
6043 if (!canNarrowOps(InterleaveR->getStoredValues(),
6044 VFToOptimize->isScalable()))
6045 return nullptr;
6046 StoreGroups.push_back(InterleaveR);
6047 }
6048
6049 if (StoreGroups.empty())
6050 return nullptr;
6051
6052 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
     // A middle block whose only successor is the scalar preheader means the
     // scalar epilogue always runs.
6053 bool RequiresScalarEpilogue =
6054 MiddleVPBB->getNumSuccessors() == 1 &&
6055 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
6056 // Bail out for tail-folding (middle block with a single successor to exit).
6057 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
6058 return nullptr;
6059
6060 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
6061 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
6062 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
6063 // TODO: Handle cases where only some interleave groups can be narrowed.
6064 std::unique_ptr<VPlan> NewPlan;
6065 if (size(Plan.vectorFactors()) != 1) {
6066 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
6067 Plan.setVF(*VFToOptimize);
6068 NewPlan->removeVF(*VFToOptimize);
6069 }
6070
6071 // Ops narrowed so far, shared across all store groups so that an op feeding
     // several groups is narrowed only once.
6072 SmallPtrSet<VPValue *, 4> NarrowedOps;
6073 VPBasicBlock *Preheader = Plan.getVectorPreheader();
6074 // Narrow operation tree rooted at store groups.
6075 for (auto *StoreGroup : StoreGroups) {
6076 VPValue *Res = narrowInterleaveGroupOp(StoreGroup->getStoredValues(),
6077 NarrowedOps, Preheader);
6078 auto *SI =
6079 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
6080 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
6081 /*Consecutive=*/true, *StoreGroup,
6082 StoreGroup->getDebugLoc());
6083 S->insertBefore(StoreGroup);
6084 StoreGroup->eraseFromParent();
6085 }
6086
6087 // Adjust induction to reflect that the transformed plan only processes one
6088 // original iteration.
     // NOTE(review): the definition of CanIVInc is on a line missing from this
     // listing.
6090 Type *CanIVTy = VectorLoop->getCanonicalIVType();
6091 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
6092 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
6093
6094 VPValue *UF = &Plan.getUF();
6095 VPValue *Step;
     // Scalable VFs: VF becomes vscale and the step vscale * UF. Fixed VFs: VF
     // becomes 1 and the step UF.
6096 if (VFToOptimize->isScalable()) {
6097 VPValue *VScale =
6098 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
6099 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
6100 {true, false});
6101 Plan.getVF().replaceAllUsesWith(VScale);
6102 } else {
6103 Step = UF;
6104 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
6105 }
6106 // Materialize vector trip count with the narrowed step.
6107 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
6108 RequiresScalarEpilogue, Step);
6109
6110 CanIVInc->setOperand(1, Step);
6111 Plan.getVFxUF().replaceAllUsesWith(Step);
6112
6113 removeDeadRecipes(Plan);
     // NOTE(review): the predicate of this assert is on a line missing from
     // this listing.
6114 assert(none_of(*VectorLoop->getEntryBasicBlock(),
6116 "All VPVectorPointerRecipes should have been removed");
6117 return NewPlan;
6118}
6119
6120/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
6121/// BranchOnCond recipe.
/// The weights are {1, VectorStep - 1}, where VectorStep is the plan's concrete
/// UF times the known-minimum value of \p VF, additionally multiplied by
/// \p VScaleForTuning when \p VF is scalable and a tuning value is provided.
6123 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
6124 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
     // NOTE(review): the initializer of MiddleTerm is on a line missing from
     // this listing.
6125 auto *MiddleTerm =
6127 // Only add branch metadata if there is a (conditional) terminator.
6128 if (!MiddleTerm)
6129 return;
6130
6131 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
6132 "must have a BranchOnCond");
6133 // Assume that `TripCount % VectorStep ` is equally distributed.
6134 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
6135 if (VF.isScalable() && VScaleForTuning.has_value())
6136 VectorStep *= *VScaleForTuning;
6137 assert(VectorStep > 0 && "trip count should not be zero");
6138 MDBuilder MDB(Plan.getContext());
6139 MDNode *BranchWeights =
6140 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
6141 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
6142}
6143
6145 VFRange &Range) {
6146 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
6147 auto *MiddleVPBB = Plan.getMiddleBlock();
6148 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6149
6150 auto IsScalableOne = [](ElementCount VF) -> bool {
6151 return VF == ElementCount::getScalable(1);
6152 };
6153
6154 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
6155 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
6156 if (!FOR)
6157 continue;
6158
6159 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
6160 "Cannot handle loops with uncountable early exits");
6161
6162 // Find the existing splice for this FOR, created in
6163 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
6164 // RecurSplice there; only RecurSplice itself still references FOR.
6165 auto *RecurSplice =
6167 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
6168
6169 // For VF vscale x 1, if vscale = 1, we are unable to extract the
6170 // penultimate value of the recurrence. Instead we rely on the existing
6171 // extract of the last element from the result of
6172 // VPInstruction::FirstOrderRecurrenceSplice.
6173 // TODO: Consider vscale_range info and UF.
6174 if (any_of(RecurSplice->users(),
6175 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
6177 Range))
6178 return;
6179
6180 // This is the second phase of vectorizing first-order recurrences, creating
6181 // extracts for users outside the loop. An overview of the transformation is
6182 // described below. Suppose we have the following loop with some use after
6183 // the loop of the last a[i-1],
6184 //
6185 // for (int i = 0; i < n; ++i) {
6186 // t = a[i - 1];
6187 // b[i] = a[i] - t;
6188 // }
6189 // use t;
6190 //
6191 // There is a first-order recurrence on "a". For this loop, the shorthand
6192 // scalar IR looks like:
6193 //
6194 // scalar.ph:
6195 // s.init = a[-1]
6196 // br scalar.body
6197 //
6198 // scalar.body:
6199 // i = phi [0, scalar.ph], [i+1, scalar.body]
6200 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
6201 // s2 = a[i]
6202 // b[i] = s2 - s1
6203 // br cond, scalar.body, exit.block
6204 //
6205 // exit.block:
6206 // use = lcssa.phi [s1, scalar.body]
6207 //
6208 // In this example, s1 is a recurrence because its value depends on the
6209 // previous iteration. In the first phase of vectorization, we created a
6210 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
6211 // for users in the scalar preheader and exit block.
6212 //
6213 // vector.ph:
6214 // v_init = vector(..., ..., ..., a[-1])
6215 // br vector.body
6216 //
6217 // vector.body:
6218 // i = phi [0, vector.ph], [i+4, vector.body]
6219 // v1 = phi [v_init, vector.ph], [v2, vector.body]
6220 // v2 = a[i, i+1, i+2, i+3]
6221 // v1' = splice(v1(3), v2(0, 1, 2))
6222 // b[i, i+1, i+2, i+3] = v2 - v1'
6223 // br cond, vector.body, middle.block
6224 //
6225 // middle.block:
6226 // vector.recur.extract.for.phi = v2(2)
6227 // vector.recur.extract = v2(3)
6228 // br cond, scalar.ph, exit.block
6229 //
6230 // scalar.ph:
6231 // scalar.recur.init = phi [vector.recur.extract, middle.block],
6232 // [s.init, otherwise]
6233 // br scalar.body
6234 //
6235 // scalar.body:
6236 // i = phi [0, scalar.ph], [i+1, scalar.body]
6237 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
6238 // s2 = a[i]
6239 // b[i] = s2 - s1
6240 // br cond, scalar.body, exit.block
6241 //
6242 // exit.block:
6243 // lo = lcssa.phi [s1, scalar.body],
6244 // [vector.recur.extract.for.phi, middle.block]
6245 //
6246 // Update extracts of the splice in the middle block: they extract the
6247 // penultimate element of the recurrence.
6249 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
6250 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
6251 continue;
6252
6253 auto *ExtractR = cast<VPInstruction>(&R);
6254 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
6255 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
6256 {}, "vector.recur.extract.for.phi");
6257 for (VPUser *ExitU : to_vector(ExtractR->users())) {
6258 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
6259 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
6260 }
6261 }
6262 }
6263}
6264
6265/// Check if \p V is a binary expression of a widened IV and a loop-invariant
6266/// value. Returns the widened IV if found, nullptr otherwise.
6268 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
6269 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
6270 Instruction::isIntDivRem(BinOp->getOpcode()))
6271 return nullptr;
6272
6273 VPValue *WidenIVCandidate = BinOp->getOperand(0);
6274 VPValue *InvariantCandidate = BinOp->getOperand(1);
6275 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
6276 std::swap(WidenIVCandidate, InvariantCandidate);
6277
6278 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
6279 return nullptr;
6280
6281 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
6282}
6283
6284/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
6285/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
6289 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
6290 auto *ClonedOp = BinOp->clone();
6291 if (ClonedOp->getOperand(0) == WidenIV) {
6292 ClonedOp->setOperand(0, ScalarIV);
6293 } else {
6294 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
6295 ClonedOp->setOperand(1, ScalarIV);
6296 }
6297 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
6298 return ClonedOp;
6299}
6300
6303 Loop &L) {
6304 ScalarEvolution &SE = *PSE.getSE();
6305 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
6306
6307 // Helper lambda to check if the IV range excludes the sentinel value. Try
6308 // signed first, then unsigned. Return an excluded sentinel if found,
6309 // otherwise return std::nullopt.
6310 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
6311 bool UseMax) -> std::optional<APSInt> {
6312 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
6313 for (bool Signed : {true, false}) {
6314 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
6315 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
6316
6317 ConstantRange IVRange =
6318 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
6319 if (!IVRange.contains(Sentinel))
6320 return Sentinel;
6321 }
6322 return std::nullopt;
6323 };
6324
6325 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
6326 for (VPRecipeBase &Phi :
6327 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
6328 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
6330 PhiR->getRecurrenceKind()))
6331 continue;
6332
6333 Type *PhiTy = PhiR->getScalarType();
6334 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
6335 continue;
6336
6337 // If there's a header mask, the backedge select will not be the find-last
6338 // select.
6339 VPValue *BackedgeVal = PhiR->getBackedgeValue();
6340 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
6341 if (HeaderMask &&
6342 !match(BackedgeVal,
6343 m_Select(m_Specific(HeaderMask),
6344 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
6345 continue;
6346
6347 // Get the find-last expression from the find-last select of the reduction
6348 // phi. The find-last select should be a select between the phi and the
6349 // find-last expression.
6350 VPValue *Cond, *FindLastExpression;
6351 if (!match(FindLastSelect, m_SelectLike(m_VPValue(Cond), m_Specific(PhiR),
6352 m_VPValue(FindLastExpression))) &&
6353 !match(FindLastSelect,
6354 m_SelectLike(m_VPValue(Cond), m_VPValue(FindLastExpression),
6355 m_Specific(PhiR))))
6356 continue;
6357
6358 // Check if FindLastExpression is a simple expression of a widened IV. If
6359 // so, we can track the underlying IV instead and sink the expression.
6360 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
6361 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
6362 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
6363 &L);
6364 const SCEV *Step;
6365 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6366 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
6368 "IVOfExpressionToSink not being an AddRec must imply "
6369 "FindLastExpression not being an AddRec.");
6370 continue;
6371 }
6372
6373 // Determine direction from SCEV step.
6374 if (!SE.isKnownNonZero(Step))
6375 continue;
6376
6377 // Positive step means we need UMax/SMax to find the last IV value, and
6378 // UMin/SMin otherwise.
6379 bool UseMax = SE.isKnownPositive(Step);
6380 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
6381 bool UseSigned = SentinelVal && SentinelVal->isSigned();
6382
6383 // Sinking an expression will disable epilogue vectorization. Only use it
6384 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
6385 // also prevent vectorizing using a sentinel (e.g., if the expression is a
6386 // multiply or divide by a large constant), which also makes sinking
6387 // undesirable.
6388 if (IVOfExpressionToSink) {
6389 const SCEV *FindLastExpressionSCEV =
6390 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
6391 if (match(FindLastExpressionSCEV,
6392 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
6393 bool NewUseMax = SE.isKnownPositive(Step);
6394 if (auto NewSentinel =
6395 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
6396 // The original expression already has a sentinel, so prefer not
6397 // sinking to keep epilogue vectorization possible.
6398 SentinelVal = *NewSentinel;
6399 UseSigned = NewSentinel->isSigned();
6400 UseMax = NewUseMax;
6401 IVSCEV = FindLastExpressionSCEV;
6402 IVOfExpressionToSink = nullptr;
6403 }
6404 }
6405 }
6406
6407 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
6408 // if the condition was ever true. Requires the IV to not wrap, otherwise we
6409 // cannot use min/max.
6410 if (!SentinelVal) {
6411 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
6412 if (AR->hasNoSignedWrap())
6413 UseSigned = true;
6414 else if (AR->hasNoUnsignedWrap())
6415 UseSigned = false;
6416 else
6417 continue;
6418 }
6419
6421 BackedgeVal,
6423
6424 VPValue *NewFindLastSelect = BackedgeVal;
6425 VPValue *SelectCond = Cond;
6426 if (!SentinelVal || IVOfExpressionToSink) {
6427 // When we need to create a new select, normalize the condition so that
6428 // PhiR is the last operand and include the header mask if needed.
6429 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
6430 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
6431 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
6432 SelectCond = LoopBuilder.createNot(SelectCond);
6433
6434 // When tail folding, mask the condition with the header mask to prevent
6435 // propagating poison from inactive lanes in the last vector iteration.
6436 if (HeaderMask)
6437 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
6438
6439 if (SelectCond != Cond || IVOfExpressionToSink) {
6440 NewFindLastSelect = LoopBuilder.createSelect(
6441 SelectCond,
6442 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
6443 PhiR, DL);
6444 }
6445 }
6446
6447 // Create the reduction result in the middle block using sentinel directly.
6448 RecurKind MinMaxKind =
6449 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
6450 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
6451 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
6452 FastMathFlags());
6453 DebugLoc ExitDL = RdxResult->getDebugLoc();
6454 VPBuilder MiddleBuilder(RdxResult);
6455 VPValue *ReducedIV =
6457 NewFindLastSelect, Flags, ExitDL);
6458
6459 // If IVOfExpressionToSink is an expression to sink, sink it now.
6460 VPValue *VectorRegionExitingVal = ReducedIV;
6461 if (IVOfExpressionToSink)
6462 VectorRegionExitingVal =
6463 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
6464 ReducedIV, IVOfExpressionToSink);
6465
6466 VPValue *NewRdxResult;
6467 VPValue *StartVPV = PhiR->getStartValue();
6468 if (SentinelVal) {
6469 // Sentinel-based approach: reduce IVs with min/max, compare against
6470 // sentinel to detect if condition was ever true, select accordingly.
6471 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
6472 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
6473 Sentinel, ExitDL);
6474 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
6475 StartVPV, ExitDL);
6476 StartVPV = Sentinel;
6477 } else {
6478 // Introduce a boolean AnyOf reduction to track if the condition was ever
6479 // true in the loop. Use it to select the initial start value, if it was
6480 // never true.
6481 auto *AnyOfPhi = new VPReductionPHIRecipe(
6482 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
6483 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
6484 AnyOfPhi->insertAfter(PhiR);
6485
6486 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
6487 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
6488 AnyOfPhi->setOperand(1, OrVal);
6489
6490 NewRdxResult = MiddleBuilder.createAnyOfReduction(
6491 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
6492
6493 // Initialize the IV reduction phi with the neutral element, not the
6494 // original start value, to ensure correct min/max reduction results.
6495 StartVPV = Plan.getOrAddLiveIn(
6496 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6497 }
6498 RdxResult->replaceAllUsesWith(NewRdxResult);
6499 RdxResult->eraseFromParent();
6500
6501 auto *NewPhiR = new VPReductionPHIRecipe(
6502 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6503 *NewFindLastSelect, RdxUnordered{1}, {},
6504 PhiR->hasUsesOutsideReductionChain());
6505 NewPhiR->insertBefore(PhiR);
6506 PhiR->replaceAllUsesWith(NewPhiR);
6507 PhiR->eraseFromParent();
6508 }
6509}
6510
6511namespace {
6512
6513using ExtendKind = TTI::PartialReductionExtendKind;
6514struct ReductionExtend {
6515 Type *SrcType = nullptr;
6516 ExtendKind Kind = ExtendKind::PR_None;
6517};
6518
6519/// Describes the extends used to compute the extended reduction operand.
6520/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6521/// operation.
6522struct ExtendedReductionOperand {
6523 /// The recipe that consumes the extends.
6524 VPWidenRecipe *ExtendsUser = nullptr;
6525 /// Extend descriptions (inputs to getPartialReductionCost).
6526 ReductionExtend ExtendA, ExtendB;
6527};
6528
6529/// A chain of recipes that form a partial reduction. Matches either
6530/// reduction_bin_op (extended op, accumulator), or
6531/// reduction_bin_op (accumulator, extended op).
6532/// The possible forms of the "extended op" are listed in
6533/// matchExtendedReductionOperand.
6534struct VPPartialReductionChain {
6535 /// The top-level binary operation that forms the reduction to a scalar
6536 /// after the loop body.
6537 VPWidenRecipe *ReductionBinOp = nullptr;
6538 /// The user of the extends that is then reduced.
6539 ExtendedReductionOperand ExtendedOp;
6540 /// The recurrence kind for the entire partial reduction chain.
6541 /// This allows distinguishing between Sub and AddWithSub recurrences,
6542 /// when the ReductionBinOp is a Instruction::Sub.
6543 RecurKind RK;
6544 /// The index of the accumulator operand of ReductionBinOp. The extended op
6545 /// is `1 - AccumulatorOpIdx`.
6546 unsigned AccumulatorOpIdx;
6547 unsigned ScaleFactor;
6548};
6549
6550static VPSingleDefRecipe *
6551optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op) {
6552 // reduce.add(mul(ext(A), C))
6553 // -> reduce.add(mul(ext(A), ext(trunc(C))))
6554 const APInt *Const;
6555 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6556 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6557 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6558 Type *NarrowTy = ExtA->getOperand(0)->getScalarType();
6559 if (!Op->hasOneUse() ||
6561 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6562 return Op;
6563
6564 VPBuilder Builder(Op);
6565 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6566 Op->getOperand(1), NarrowTy);
6567 Type *WideTy = ExtA->getScalarType();
6568 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6569 return Op;
6570 }
6571
6572 // reduce.add(abs(sub(ext(A), ext(B))))
6573 // -> reduce.add(ext(absolute-difference(A, B)))
6574 VPValue *X, *Y;
6577 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6578 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6579 assert(Ext->getOpcode() ==
6580 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6581 "Expected both the LHS and RHS extends to be the same");
6582 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6583 VPBuilder Builder(Op);
6584 Type *SrcTy = X->getScalarType();
6585 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6586 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6587 auto *Max = Builder.insert(
6588 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6589 {FreezeX, FreezeY}, SrcTy));
6590 auto *Min = Builder.insert(
6591 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6592 {FreezeX, FreezeY}, SrcTy));
6593 auto *AbsDiff =
6594 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6595 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6596 Op->getScalarType());
6597 }
6598
6599 // reduce.add(ext(mul(ext(A), ext(B))))
6600 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6601 // TODO: Support this optimization for float types.
6603 m_ZExtOrSExt(m_VPValue()))))) {
6604 auto *Ext = cast<VPWidenCastRecipe>(Op);
6605 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6606 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6607 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6608 if (!Mul->hasOneUse() ||
6609 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6610 MulLHS->getOpcode() != MulRHS->getOpcode())
6611 return Op;
6612 VPBuilder Builder(Mul);
6613 auto *NewLHS = Builder.createWidenCast(
6614 MulLHS->getOpcode(), MulLHS->getOperand(0), Ext->getScalarType());
6615 auto *NewRHS = MulLHS == MulRHS
6616 ? NewLHS
6617 : Builder.createWidenCast(MulRHS->getOpcode(),
6618 MulRHS->getOperand(0),
6619 Ext->getScalarType());
6620 auto *NewMul = Mul->cloneWithOperands({NewLHS, NewRHS});
6621 Builder.insert(NewMul);
6622 Op->replaceAllUsesWith(NewMul);
6623 Op->eraseFromParent();
6624 Mul->eraseFromParent();
6625 return NewMul;
6626 }
6627
6628 return Op;
6629}
6630
6631static VPExpressionRecipe *
6632createPartialReductionExpression(VPReductionRecipe *Red) {
6633 VPValue *VecOp = Red->getVecOp();
6634
6635 // reduce.[f]add(ext(op))
6636 // -> VPExpressionRecipe(op, red)
6637 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
6638 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
6639
6640 // reduce.[f]add(neg(ext(op)))
6641 // -> VPExpressionRecipe(op, sub/neg, red)
6642 if (match(VecOp, m_AnyNeg(m_WidenAnyExtend(m_VPValue())))) {
6643 auto *Neg = cast<VPWidenRecipe>(VecOp);
6644 auto *Ext =
6645 cast<VPWidenCastRecipe>(Neg->getOperand(Neg->getNumOperands() - 1));
6646 return new VPExpressionRecipe(Ext, Neg, Red);
6647 }
6648
6649 // reduce.[f]add([f]mul(ext(a), ext(b)))
6650 // -> VPExpressionRecipe(a, b, mul, red)
6651 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
6652 match(VecOp,
6654 auto *Mul = cast<VPWidenRecipe>(VecOp);
6655 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6656 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6657 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
6658 }
6659
6660 // reduce.fadd(fneg(fmul(fpext(a), fpext(b))))
6661 // -> VPExpressionRecipe(a, b, fmul, fsub, red)
6662 if (match(VecOp,
6664 auto *FNeg = cast<VPWidenRecipe>(VecOp);
6665 auto *FMul = cast<VPWidenRecipe>(FNeg->getOperand(0));
6666 auto *ExtA = cast<VPWidenCastRecipe>(FMul->getOperand(0));
6667 auto *ExtB = cast<VPWidenCastRecipe>(FMul->getOperand(1));
6668 return new VPExpressionRecipe(ExtA, ExtB, FMul, FNeg, Red);
6669 }
6670
6671 // reduce.add(neg(mul(ext(a), ext(b))))
6672 // -> VPExpressionRecipe(a, b, mul, sub, red)
6674 m_ZExtOrSExt(m_VPValue()))))) {
6675 auto *Sub = cast<VPWidenRecipe>(VecOp);
6676 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
6677 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6678 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6679 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
6680 }
6681
6682 llvm_unreachable("Unsupported expression");
6683}
6684
6685// Helper to transform a partial reduction chain into a partial reduction
6686// recipe. Assumes profitability has been checked.
6687static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6688 VPlan &Plan,
6689 VPReductionPHIRecipe *RdxPhi) {
6690 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6691 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6692
6693 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6694 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6695 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6696
6697 // FIXME: Do these transforms before invoking the cost-model.
6698 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp);
6699
6700 // Sub-reductions can be implemented in two ways:
6701 // (1) negate the operand in the vector loop (the default way).
6702 // (2) subtract the reduced value from the init value in the middle block.
6703 // Both ways keep the reduction itself as an 'add' reduction.
6704 //
6705 // The ISD nodes for partial reductions don't support folding the
6706 // sub/negation into its operands because the following is not a valid
6707 // transformation:
6708 // sub(0, mul(ext(a), ext(b)))
6709 // -> mul(ext(a), ext(sub(0, b)))
6710 //
6711 // It's therefore better to choose option (2) such that the partial
6712 // reduction is always positive (starting at '0') and to do a final
6713 // subtract in the middle block.
6714 if ((WidenRecipe->getOpcode() == Instruction::Sub &&
6715 Chain.RK != RecurKind::Sub) ||
6716 (WidenRecipe->getOpcode() == Instruction::FSub &&
6717 Chain.RK != RecurKind::FSub)) {
6718 VPBuilder Builder(WidenRecipe);
6719 Type *ElemTy = ExtendedOp->getScalarType();
6720 VPWidenRecipe *NegRecipe;
6721 if (WidenRecipe->getOpcode() == Instruction::FSub) {
6722 NegRecipe =
6723 new VPWidenRecipe(Instruction::FNeg, {ExtendedOp}, VPIRFlags(),
6725 } else {
6726 auto *Zero = Plan.getZero(ElemTy);
6727 NegRecipe =
6728 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6730 }
6731 Builder.insert(NegRecipe);
6732 ExtendedOp = NegRecipe;
6733 }
6734
6735 // Check if WidenRecipe is the final result of the reduction. If so look
6736 // through selects for predicated reductions.
6737 VPValue *Cond = nullptr;
6739 findUserOf(WidenRecipe, m_Select(m_VPValue(Cond), m_Specific(WidenRecipe),
6740 m_Specific(RdxPhi))));
6741 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6742 RdxPhi->getBackedgeValue() == ExitValue;
6743 assert((!ExitValue || IsLastInChain) &&
6744 "if we found ExitValue, it must match RdxPhi's backedge value");
6745
6746 Type *PhiType = RdxPhi->getScalarType();
6747 RecurKind RdxKind =
6749 auto *PartialRed = new VPReductionRecipe(
6750 RdxKind,
6751 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlagsOrNone()
6752 : FastMathFlags(),
6753 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6754 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6755 PartialRed->insertBefore(WidenRecipe);
6756
6757 if (Cond)
6758 ExitValue->replaceAllUsesWith(PartialRed);
6759 WidenRecipe->replaceAllUsesWith(PartialRed);
6760
6761 // For cost-model purposes, fold this into a VPExpression.
6762 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6763 E->insertBefore(WidenRecipe);
6764 PartialRed->replaceAllUsesWith(E);
6765
6766 // We only need to update the PHI node once, which is when we find the
6767 // last reduction in the chain.
6768 if (!IsLastInChain)
6769 return;
6770
6771 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6772 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6773 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6774
6775 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6776 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6777 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6778 StartInst->setOperand(2, NewScaleFactor);
6779
6780 // If this is the last value in a sub-reduction chain, then update the PHI
6781 // node to start at `0` and update the reduction-result to subtract from
6782 // the PHI's start value.
6783 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6784 return;
6785
6786 VPValue *OldStartValue = StartInst->getOperand(0);
6787 StartInst->setOperand(0, StartInst->getOperand(1));
6788
6789 // Replace reduction_result by 'sub (startval, reductionresult)'.
6791 assert(RdxResult && "Could not find reduction result");
6792
6793 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6794 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6795 : Instruction::BinaryOps::Sub;
6796 VPInstruction *NewResult = Builder.createNaryOp(
6797 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6798 RdxPhi->getDebugLoc());
6799 RdxResult->replaceUsesWithIf(
6800 NewResult,
6801 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6802}
6803
6804/// Returns the cost of a link in a partial-reduction chain for a given VF.
6805static InstructionCost
6806getPartialReductionLinkCost(VPCostContext &CostCtx,
6807 const VPPartialReductionChain &Link,
6808 ElementCount VF) {
6809 Type *RdxType = Link.ReductionBinOp->getScalarType();
6810 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6811 std::optional<unsigned> BinOpc = std::nullopt;
6812 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6813 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6814 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6815
6816 std::optional<llvm::FastMathFlags> Flags;
6817 if (RdxType->isFloatingPointTy())
6818 Flags = Link.ReductionBinOp->getFastMathFlagsOrNone();
6819
6820 auto GetLinkOpcode = [&Link]() -> unsigned {
6821 switch (Link.RK) {
6822 case RecurKind::Sub:
6823 return Instruction::Add;
6824 case RecurKind::FSub:
6825 return Instruction::FAdd;
6826 default:
6827 return Link.ReductionBinOp->getOpcode();
6828 }
6829 };
6830
6831 return CostCtx.TTI.getPartialReductionCost(
6832 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6833 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6834 CostCtx.CostKind, Flags);
6835}
6836
6837static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6839}
6840
6841/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6842/// operand. This is an operand where the source of the value (e.g. a load) has
6843/// been extended (sext, zext, or fpext) before it is used in the reduction.
6844///
6845/// Possible forms matched by this function:
6846/// - UpdateR(PrevValue, ext(...))
6847/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6848/// - UpdateR(PrevValue, mul(ext(...), Constant))
6849/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6850/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6851 /// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6852///
6853/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6854static std::optional<ExtendedReductionOperand>
6855matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
6856 assert(is_contained(UpdateR->operands(), Op) &&
6857 "Op should be operand of UpdateR");
6858
6859 // Try matching an absolute difference operand of the form
6860 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6861 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6862 // difference on a wider type and get the extend for "free" from the partial
6863 // reduction.
6864 VPValue *X, *Y;
6865 if (Op->hasOneUse() &&
6869 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6870 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6871 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6872 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6873 Type *LHSInputType = X->getScalarType();
6874 Type *RHSInputType = Y->getScalarType();
6875 if (LHSInputType != RHSInputType ||
6876 LHSExt->getOpcode() != RHSExt->getOpcode())
6877 return std::nullopt;
6878 // Note: This is essentially the same as matching ext(...) as we will
6879 // rewrite this operand to ext(absolute-difference(A, B)).
6880 return ExtendedReductionOperand{
6881 Sub,
6882 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6883 /*ExtendB=*/{}};
6884 }
6885
6886 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6888 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6889 VPValue *CastSource = CastRecipe->getOperand(0);
6890 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6891 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6892 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6893 // Match: ext(mul(...))
6894 // Record the outer extend kind and set `Op` to the mul. We can then match
6895 // this as a binary operation. Note: We can optimize out the outer extend
6896 // by widening the inner extends to match it. See
6897 // optimizeExtendsForPartialReduction.
6898 Op = CastSource;
6899 } else {
6900 return ExtendedReductionOperand{
6901 UpdateR,
6902 /*ExtendA=*/{CastSource->getScalarType(), *OuterExtKind},
6903 /*ExtendB=*/{}};
6904 }
6905 }
6906
6907 if (!Op->hasOneUse())
6908 return std::nullopt;
6909
6911 if (!MulOp ||
6912 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6913 return std::nullopt;
6914
6915 // The rest of the matching assumes `Op` is a (possibly extended) mul
6916 // operation.
6917
6918 VPValue *LHS = MulOp->getOperand(0);
6919 VPValue *RHS = MulOp->getOperand(1);
6920
6921 // The LHS of the operation must always be an extend.
6923 return std::nullopt;
6924
6925 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6926 Type *LHSInputType = LHSCast->getOperand(0)->getScalarType();
6927 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6928
6929 // The RHS of the operation can be an extend or a constant integer.
6930 const APInt *RHSConst = nullptr;
6931 VPWidenCastRecipe *RHSCast = nullptr;
6933 RHSCast = cast<VPWidenCastRecipe>(RHS);
6934 else if (!match(RHS, m_APInt(RHSConst)) ||
6935 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6936 return std::nullopt;
6937
6938 // The outer extend kind must match the inner extends for folding.
6939 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6940 if (Cast && OuterExtKind &&
6941 getPartialReductionExtendKind(Cast) != OuterExtKind)
6942 return std::nullopt;
6943
6944 Type *RHSInputType = LHSInputType;
6945 ExtendKind RHSExtendKind = LHSExtendKind;
6946 if (RHSCast) {
6947 RHSInputType = RHSCast->getOperand(0)->getScalarType();
6948 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6949 }
6950
6951 return ExtendedReductionOperand{
6952 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6953}
6954
6955/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6956/// and determines if the target can use a cheaper operation with a wider
6957/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6958/// of operations in the reduction.
///
/// The walk starts at operand 0 of the compute-reduction-result found for
/// \p RedPhiR and follows one operand of each update recipe until it reaches
/// \p RedPhiR itself. std::nullopt is returned when:
///  - vputils::findComputeReductionResult yields nothing for \p RedPhiR;
///  - a value on the walk is not a VPWidenRecipe with a binary opcode;
///  - neither operand of an update is accepted by
///    matchExtendedReductionOperand;
///  - PHISize.hasKnownScalarFactor(ExtSrcSize) is false for a link.
/// On success the collected links are reversed before being returned.
///
/// NOTE(review): \p CostCtx and \p Range are not referenced by any statement
/// visible in this listing - confirm against the full source.
6959static std::optional<SmallVector<VPPartialReductionChain>>
6960getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6961 VFRange &Range) {
6962 // Get the backedge value from the reduction PHI and find the
6963 // ComputeReductionResult that uses it (directly or through a select for
6964 // predicated reductions).
6965 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6966 if (!RdxResult)
6967 return std::nullopt;
6968 VPValue *ExitValue = RdxResult->getOperand(0);
 // When the exit value is a select, rebind ExitValue to the select's second
 // operand. The match result is discarded; presumably ExitValue is left
 // unchanged when the pattern does not match - TODO confirm.
6969 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6970
 // NOTE(review): listing line 6971 is absent here. `Chain`, which is pushed
 // to and returned below, has no declaration among the visible lines.
6972 RecurKind RK = RedPhiR->getRecurrenceKind();
6973 Type *PhiType = RedPhiR->getScalarType();
6974 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6975
6976 // Work backwards from the ExitValue examining each reduction operation.
6977 VPValue *CurrentValue = ExitValue;
6978 while (CurrentValue != RedPhiR) {
6979 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6980 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6981 return std::nullopt;
6982
 // Operand 1 is tried as the extended operand first; operand 0 is assumed
 // to be the previous link until proven otherwise.
6983 VPValue *Op = UpdateR->getOperand(1);
6984 VPValue *PrevValue = UpdateR->getOperand(0);
6985
6986 // Find the extended operand. The other operand (PrevValue) is the next link
6987 // in the reduction chain.
6988 std::optional<ExtendedReductionOperand> ExtendedOp =
6989 matchExtendedReductionOperand(UpdateR, Op);
6990 if (!ExtendedOp) {
 // Retry with the roles swapped: operand 0 as the extended operand.
6991 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6992 if (!ExtendedOp)
6993 return std::nullopt;
6994 std::swap(Op, PrevValue);
6995 }
6996
 // Only ExtendA's source type is consulted for the size relationship.
6997 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6998 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6999 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
7000 return std::nullopt;
7001
7002 // Check if a partial reduction chain is supported by the target (i.e. does
7003 // not have an invalid cost) for the given VF range. Clamps the range and
7004 // returns true if feasible for any VF.
 // NOTE(review): the statement below only builds and records the link; no
 // cost query or range clamp is visible here, so the comment above may be
 // stale - confirm.
 // The fourth initializer is the index (0 or 1) of the UpdateR operand that
 // continues the chain; the fifth is PHISize.getKnownScalarFactor(ExtSrcSize)
 // narrowed to unsigned.
7005 VPPartialReductionChain Link(
7006 {UpdateR, *ExtendedOp, RK,
7007 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
7008 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
7009 Chain.push_back(Link);
7010 CurrentValue = PrevValue;
7011 }
7012
7013 // The chain links were collected by traversing backwards from the exit value.
7014 // Reverse the chains so they are in program order.
7015 std::reverse(Chain.begin(), Chain.end());
7016 return Chain;
7017}
7018} // namespace
7019
7021 VPCostContext &CostCtx,
7022 VFRange &Range) {
 // NOTE(review): the first line of this signature (listing line 7020), which
 // carries the function name and the `Plan` parameter used below, is absent
 // from this listing.
 //
 // Overview of the visible body:
 //  1. Collect a candidate chain per reduction PHI in the vector loop header
 //     via getScaledReductions.
 //  2. Clear a PHI's chain when one of its links has an extend with a user
 //     outside the partial-reduction set, a reduction bin op with an
 //     unsupported user, or a compute-reduction-result used by a replicated
 //     store.
 //  3. Clear a chain that fails the profitability test.
 //  4. Call transformToPartialReduction on every remaining link.
7023 // Find all possible valid partial reductions, grouping chains by their PHI.
7024 // This grouping allows invalidating the whole chain, if any link is not a
7025 // valid partial reduction.
 // NOTE(review): listing line 7026 (the type of ChainsByPhi) is absent.
7027 ChainsByPhi;
7028 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7029 for (VPRecipeBase &R : HeaderVPBB->phis()) {
7030 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
7031 if (!RedPhiR)
7032 continue;
7033
7034 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
7035 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
7036 }
7037
7038 if (ChainsByPhi.empty())
7039 return;
7040
7041 // Build set of partial reduction operations for extend user validation and
7042 // a map of reduction bin ops to their scale factors for scale validation.
7043 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
7044 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
7045 for (const auto &[_, Chains] : ChainsByPhi)
7046 for (const VPPartialReductionChain &Chain : Chains) {
7047 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
7048 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
7049 }
7050
7051 // A partial reduction is invalid if any of its extends are used by
7052 // something that isn't another partial reduction. This is because the
7053 // extends are intended to be lowered along with the reduction itself.
 // Values that are not VPWidenCastRecipes are accepted unconditionally.
7054 auto ExtendUsersValid = [&](VPValue *Ext) {
7055 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
7056 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
7057 });
7058 };
7059
7060 auto IsProfitablePartialReductionChainForVF =
7061 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
7062 InstructionCost PartialCost = 0, RegularCost = 0;
7063
7064 // The chain is a profitable partial reduction chain if the cost of handling
7065 // the entire chain is cheaper when using partial reductions than when
7066 // handling the entire chain using regular reductions.
7067 for (const VPPartialReductionChain &Link : Chain) {
7068 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
7069 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
 // A single link with an invalid cost rejects the whole chain.
7070 if (!LinkCost.isValid())
7071 return false;
7072
7073 PartialCost += LinkCost;
 // The regular cost sums the reduction bin op, the extends' user when it
 // is a separate binary operation, and every widened cast feeding it.
7074 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
7075 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
7076 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
7077 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
7078 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
7079 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
7080 RegularCost += Extend->computeCost(VF, CostCtx);
7081 }
 // Strictly cheaper is required; equal cost is not considered profitable.
7082 return PartialCost.isValid() && PartialCost < RegularCost;
7083 };
7084
7085 // Validate chains: check that extends are only used by partial reductions,
7086 // and that reduction bin ops are only used by other partial reductions with
7087 // matching scale factors, are outside the loop region or the select
7088 // introduced by tail-folding. Otherwise we would create users of scaled
7089 // reductions where the types of the other operands don't match.
7090 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
7091 for (const VPPartialReductionChain &Chain : Chains) {
7092 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
7093 Chains.clear();
7094 break;
7095 }
 // A reduction-PHI user is valid only when it is this chain's own PHI.
7096 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
7097 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
7098 return PhiR == RedPhiR;
7099 auto *R = cast<VPSingleDefRecipe>(U);
 // lookup_or(R, 0) yields 0 for recipes that are not recorded reduction
 // bin ops.
 // NOTE(review): listing line 7101 (the head of the second alternative
 // in this disjunction) is absent.
7100 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
7102 m_Specific(Chain.ReductionBinOp))) ||
7103 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
7104 m_Specific(RedPhiR)));
7105 };
7106 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
7107 Chains.clear();
7108 break;
7109 }
7110
7111 // Check if the compute-reduction-result is used by a sunk store.
7112 // TODO: Also form partial reductions in those cases.
7113 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
7114 if (any_of(RdxResult->users(), [](VPUser *U) {
7115 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
7116 return RepR && RepR->getOpcode() == Instruction::Store;
7117 })) {
7118 Chains.clear();
7119 break;
7120 }
7121 }
7122 }
7123
7124 // Clear the chain if it is not profitable.
 // NOTE(review): listing line 7125 (the head of this condition, which
 // receives the lambda and Range) is absent.
7126 [&, &Chains = Chains](ElementCount VF) {
7127 return IsProfitablePartialReductionChainForVF(Chains, VF);
7128 },
7129 Range))
7130 Chains.clear();
7131 }
7132
 // Chains cleared above are empty, so they contribute no transformations.
7133 for (auto &[Phi, Chains] : ChainsByPhi)
7134 for (const VPPartialReductionChain &Chain : Chains)
7135 transformToPartialReduction(Chain, Plan, Phi);
7136}
7137
7139 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
 // NOTE(review): the line carrying this function's name (listing line 7138)
 // is absent from this listing.
 //
 // Overview of the visible body: gather VPInstructions that wrap an
 // underlying load or store, then run a sequence of named sub-passes over the
 // remaining list. Each sub-pass removes the instructions it handled, so later
 // sub-passes only see what earlier ones declined.
7140 // Collect all loads/stores first. We will start with ones having simpler
7141 // decisions followed by more complex ones that are potentially
7142 // guided/dependent on the simpler ones.
 // NOTE(review): listing lines 7143, 7145 and 7146 are absent: the
 // declaration of MemOps and the block range iterated by the loop below.
7144 for (VPBasicBlock *VPBB :
7147 for (VPRecipeBase &R : *VPBB) {
7148 auto *VPI = dyn_cast<VPInstruction>(&R);
7149 if (VPI && VPI->getUnderlyingValue() &&
7150 is_contained({Instruction::Load, Instruction::Store},
7151 VPI->getOpcode()))
7152 MemOps.push_back(VPI);
7153 }
7154 }
7155
7156 // Few helpers to process different kinds of memory operations.
7157
7158 // To be used as argument to `VPlanTransforms::runPass` which explicitly
7159 // specified pass name, hence `VPlan &` parameter.
 // ProcessVPInst returns true when it handled (and replaced) the
 // instruction; unhandled instructions are carried over into MemOps.
7160 auto ProcessSubset = [&](VPlan &, auto ProcessVPInst) {
7161 SmallVector<VPInstruction *> RemainingMemOps;
7162 for (VPInstruction *VPI : MemOps) {
7163 if (!ProcessVPInst(VPI))
7164 RemainingMemOps.push_back(VPI);
7165 }
7166
7167 MemOps.clear();
7168 std::swap(MemOps, RemainingMemOps);
7169 };
7170
 // Inserts New before VPI and erases VPI. Uses are redirected only for
 // loads; for stores no use replacement is performed.
7171 auto ReplaceWith = [&](VPInstruction *VPI, VPRecipeBase *New) {
7172 New->insertBefore(VPI);
7173 if (VPI->getOpcode() == Instruction::Load)
7174 VPI->replaceAllUsesWith(New->getVPSingleValue());
7175 VPI->eraseFromParent();
7176
7177 // VPI has been processed.
7178 return true;
7179 };
7180
 // Replaces VPI with whatever RecipeBuilder.handleReplication returns.
7181 auto Scalarize = [&](VPInstruction *VPI) {
7182 return ReplaceWith(VPI, RecipeBuilder.handleReplication(VPI, Range));
7183 };
7184
7185 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
7186 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 // NOTE(review): listing line 7187 (the head of this call, presumably
 // VPlanTransforms::runPass as in the last sub-pass below) is absent.
7188 "lowerMemoryIdioms", ProcessSubset, Plan, [&](VPInstruction *VPI) {
7189 if (RecipeBuilder.replaceWithFinalIfReductionStore(
7190 VPI, FinalRedStoresBuilder))
7191 return true;
7192
7193 // Filter out scalar VPlan for the remaining idioms.
 // NOTE(review): listing line 7194 (the head of this condition) is
 // absent.
7195 [](ElementCount VF) { return VF.isScalar(); }, Range))
7196 return false;
7197
7198 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI))
7199 return ReplaceWith(VPI, Histogram);
7200
7201 return false;
7202 });
7203
7204 // Filter out scalar VPlan for the remaining memory operations.
 // NOTE(review): listing line 7205 (the head of this condition) is absent.
7206 [](ElementCount VF) { return VF.isScalar(); }, Range))
7207 return;
7208
7209 // If the instruction's allocated size doesn't equal it's type size, it
7210 // requires padding and will be scalarized.
 // NOTE(review): listing lines 7211 and 7214 are absent: the call head and
 // the declaration of `I` used in the lambda below.
7212 "scalarizeMemOpsWithIrregularTypes", ProcessSubset, Plan,
7213 [&](VPInstruction *VPI) {
7215 if (hasIrregularType(getLoadStoreType(I), I->getDataLayout()))
7216 return Scalarize(VPI);
7217
7218 return false;
7219 });
7220
 // Final sub-pass: every remaining memory op is handled, either by the
 // recipe from tryToWidenMemory or, when that returns null, by scalarizing.
7221 VPlanTransforms::runPass("delegateMemOpWideningToLegacyCM", ProcessSubset,
7222 Plan, [&](VPInstruction *VPI) {
7223 if (VPRecipeBase *Recipe =
7224 RecipeBuilder.tryToWidenMemory(VPI, Range))
7225 return ReplaceWith(VPI, Recipe);
7226
7227 return Scalarize(VPI);
7228 });
7229}
7230
 // NOTE(review): listing lines 7231 and 7232 (the function signature and the
 // head of the condition below) are absent from this listing. The visible body
 // uses `Plan` and `Range`, which presumably are parameters.
 //
 // Overview of the visible body: after an early return guarded by a scalar-VF
 // predicate over Range, walk each block's recipes in reverse and replace
 // eligible VPInstructions with a single-scalar op.
7233 [&](ElementCount VF) { return VF.isScalar(); }, Range))
7234 return;
7235
 // NOTE(review): listing lines 7236 and 7238 are absent: the traversal
 // started from Plan.getEntry() and the loop head declaring VPBB.
7237 Plan.getEntry());
 // Early-inc iteration allows erasing the current recipe inside the loop.
7239 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
7240 auto *VPI = dyn_cast<VPInstruction>(&R);
7241 if (!VPI)
7242 continue;
7243
7244 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
7245 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
7246 if (!I)
7247 continue;
7248
7249 // If executing other lanes produces side-effects we can't avoid them.
7250 if (VPI->mayHaveSideEffects())
7251 continue;
7252
7253 // We want to drop the mask operand, verify we can safely do that.
7254 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
7255 continue;
7256
7257 // Avoid rewriting IV increment as that interferes with
7258 // `removeRedundantCanonicalIVs`.
 // NOTE(review): listing line 7260 (second half of this condition) is
 // absent.
7259 if (VPI->getOpcode() == Instruction::Add &&
7261 continue;
7262
7263 // Other lanes are needed - can't drop them.
 // NOTE(review): listing line 7264 (the condition guarding this
 // `continue`) is absent.
7265 continue;
7266
 // Build the replacement from the operands without the mask and with a
 // null mask, then swap it in for VPI.
7267 auto *Recipe = VPBuilder::createSingleScalarOp(
7268 VPI->getOpcode(), VPI->operandsWithoutMask(), /*Mask=*/nullptr, *VPI,
7269 *VPI, VPI->getDebugLoc(), I);
7270 Recipe->insertBefore(VPI);
7271 VPI->replaceAllUsesWith(Recipe);
7272 VPI->eraseFromParent();
7273 }
7274 }
7275}
7276
7277/// Returns true if \p Info's parameter kinds are compatible with \p Args.
///
/// Every entry of Info.Shape.Parameters must pass its per-kind check:
///  - Vector and GlobalPredicate parameters are always accepted.
///  - OMP_Uniform requires the argument at Param.ParamPos to have a SCEVable
///    scalar type and a SCEV expression that is invariant in \p L.
///  - OMP_Linear requires the argument's SCEV expression to match an affine
///    add-recurrence on \p L whose step is Param.LinearStepOrPos.
///  - Any other parameter kind is rejected.
///
/// NOTE(review): \p Args is indexed with Param.ParamPos without a bounds check
/// in this function; callers presumably guarantee the index is in range.
7278static bool areVFParamsOk(const VFInfo &Info, ArrayRef<VPValue *> Args,
7279 PredicatedScalarEvolution &PSE, const Loop *L) {
7280 ScalarEvolution *SE = PSE.getSE();
7281 return all_of(Info.Shape.Parameters, [&](VFParameter Param) {
7282 switch (Param.ParamKind) {
 // These kinds place no constraint on the argument; Args is not consulted.
7283 case VFParamKind::Vector:
7284 case VFParamKind::GlobalPredicate:
7285 return true;
7286 case VFParamKind::OMP_Uniform:
 // The SCEVable check runs first, so the SCEV is only built for types
 // ScalarEvolution accepts.
7287 return SE->isSCEVable(Args[Param.ParamPos]->getScalarType()) &&
7288 SE->isLoopInvariant(
7289 vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7290 L);
7291 case VFParamKind::OMP_Linear:
 // The start of the recurrence is unconstrained (m_SCEV()); only the step
 // and the loop are pinned.
7292 return match(vputils::getSCEVExprForVPValue(Args[Param.ParamPos], PSE, L),
7293 m_scev_AffineAddRec(
7294 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
7295 m_SpecificLoop(L)));
7296 default:
7297 return false;
7298 }
7299 });
7300}
7301
7302/// Find a vector variant of \p CI for \p VF, respecting \p MaskRequired.
7303/// Returns the variant function, or nullptr. Masked variants are assumed to
7304/// take the mask as a trailing parameter.
///
/// The first mapping from VFDatabase::getMappings is chosen whose shape VF
/// equals \p VF, which is masked whenever \p MaskRequired is set, and whose
/// parameters pass areVFParamsOk. A masked mapping may also be chosen when no
/// mask is required. nullptr is returned when \p CI is marked no-builtin or no
/// mapping qualifies; otherwise the result of looking up the mapping's
/// VectorName in \p CI's module is returned as-is.
///
/// NOTE(review): listing lines 7305 and 7307 (parts of the parameter list,
/// including the declarations of CI, Args and PSE) are absent here.
7306 ElementCount VF, bool MaskRequired,
7308 const Loop *L) {
7309 if (CI->isNoBuiltin())
7310 return nullptr;
7311 auto Mappings = VFDatabase::getMappings(*CI);
7312 const auto *It = find_if(Mappings, [&](const VFInfo &Info) {
7313 return Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()) &&
7314 areVFParamsOk(Info, Args, PSE, L);
7315 });
7316 if (It == Mappings.end())
7317 return nullptr;
7318 return CI->getModule()->getFunction(It->VectorName);
7319}
7320
7321namespace {
7322/// The outcome of choosing how to widen a call at a given VF.
7323struct CallWideningDecision {
 /// The three ways a call can be handled.
7324 enum class KindTy { Scalarize, Intrinsic, VectorVariant };
 /// The constructor is not explicit, so a bare KindTy converts to a decision
 /// with a null Variant.
7325 CallWideningDecision(KindTy Kind, Function *Variant = nullptr)
7326 : Kind(Kind), Variant(Variant) {}
7327 KindTy Kind;
7328
7329 /// Set when Kind == VectorVariant.
 // NOTE(review): listing line 7330 (the declaration of the `Variant` member
 // initialized and compared in this struct) is absent.
7331
 /// Two decisions are equal when both the kind and the variant match.
7332 bool operator==(const CallWideningDecision &Other) const {
7333 return Kind == Other.Kind && Variant == Other.Variant;
7334 }
7335};
7336} // namespace
7337
7338/// Pick the cheapest widening for the call \p VPI at \p VF among scalarization,
7339/// vector intrinsic, and vector library variant.
///
/// Decision order in the visible code:
///  1. Scalarize when \p VF is scalar or CostCtx.willBeScalarized says so.
///  2. Scalarize under a further condition whose code is not visible here (see
///     the "pseudo intrinsics" comment below).
///  3. Choose the intrinsic when its cost is valid, no higher than the scalar
///     cost, and no higher than the vector variant's cost if one exists.
///  4. Choose the vector variant when one exists and its cost is no higher
///     than the scalar cost.
///  5. Otherwise scalarize.
/// Ties therefore favour the intrinsic first and the vector variant second.
///
/// NOTE(review): listing line 7341 (a parameter line; `Ops` is used below but
/// not declared in view) is absent here.
7340static CallWideningDecision decideCallWidening(VPInstruction &VPI,
7342 ElementCount VF,
7343 VPCostContext &CostCtx) {
7344 auto *CI = cast<CallInst>(VPI.getUnderlyingInstr());
7345
7346 // Scalar VFs and calls forced or known to scalarize always replicate.
7347 if (VF.isScalar() || CostCtx.willBeScalarized(CI, VF))
7348 return CallWideningDecision::KindTy::Scalarize;
7349
 // NOTE(review): listing lines 7351 and 7353 are absent: the operand of this
 // cast and, presumably, the declaration of `ID` tested further down.
7350 auto *CalledFn = cast<Function>(
7352 Type *ResultTy = VPI.getScalarType();
7354 bool MaskRequired = CostCtx.isMaskRequired(CI);
7355
7356 // Pseudo intrinsics (assume, lifetime, ...) are always scalarized.
 // NOTE(review): listing line 7357 (the condition guarding this return) is
 // absent.
7358 return CallWideningDecision::KindTy::Scalarize;
7359
7360 InstructionCost ScalarCost =
7361 VPReplicateRecipe::computeCallCost(CalledFn, ResultTy, Ops,
7362 /*IsSingleScalar=*/false, VF, CostCtx);
7363
7364 Function *VecFunc =
7365 findVectorVariant(CI, Ops, VF, MaskRequired, CostCtx.PSE, CostCtx.L);
 // NOTE(review): listing line 7366 (the declaration of VecCallCost) is
 // absent; its value when VecFunc is null is not visible here.
7367 if (VecFunc)
7368 VecCallCost = VPWidenCallRecipe::computeCallCost(VecFunc, CostCtx);
7369
7370 // Prefer the intrinsic if it is at least as cheap as scalarizing and any
7371 // available vector variant.
7372 if (ID) {
 // NOTE(review): listing lines 7373 and 7374 (the computation of
 // IntrinsicCost) are absent.
7375 if (IntrinsicCost.isValid() && ScalarCost >= IntrinsicCost &&
7376 (!VecFunc || VecCallCost >= IntrinsicCost))
7377 return CallWideningDecision::KindTy::Intrinsic;
7378 }
7379
7380 // Otherwise, use a vector library variant when it beats scalarizing.
 // The comparison is >=, so a variant that merely ties with scalarizing is
 // also chosen.
7381 if (VecFunc && ScalarCost >= VecCallCost)
7382 return {CallWideningDecision::KindTy::VectorVariant, VecFunc};
7383
7384 return CallWideningDecision::KindTy::Scalarize;
7385}
7386
7388 VPRecipeBuilder &RecipeBuilder,
7389 VPCostContext &CostCtx) {
 // NOTE(review): listing lines 7387, 7390 and 7391 are absent: the first line
 // of the signature (function name; `Plan` and `Range` are used below) and
 // the loop head that declares VPBB.
 //
 // Overview of the visible body: for every VPInstruction that wraps an
 // underlying call, pick a widening decision at Range.Start, then replace the
 // instruction with an intrinsic recipe, a widened call recipe, or the recipe
 // returned by RecipeBuilder.handleReplication.
7392 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7393 auto *VPI = dyn_cast<VPInstruction>(&R);
7394 if (!VPI || !VPI->getUnderlyingValue() ||
7395 VPI->getOpcode() != Instruction::Call)
7396 continue;
7397
7398 auto *CI = cast<CallInst>(VPI->getUnderlyingInstr());
 // Only the first CI->arg_size() operands are treated as call arguments.
7399 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7400 VPI->op_begin() + CI->arg_size());
7401
7402 CallWideningDecision Decision =
7403 decideCallWidening(*VPI, Ops, Range.Start, CostCtx);
 // NOTE(review): listing line 7404 (the head of the call receiving this
 // predicate and Range) is absent. The predicate holds for a VF whose
 // decision equals the one made at Range.Start.
7405 [&](ElementCount VF) {
7406 return Decision == decideCallWidening(*VPI, Ops, VF, CostCtx);
7407 },
7408 Range);
7409
7410 VPSingleDefRecipe *Replacement = nullptr;
7411 switch (Decision.Kind) {
7412 case CallWideningDecision::KindTy::Intrinsic: {
 // NOTE(review): listing line 7413 (the declaration of `ID`) is absent.
7414 Type *ResultTy = VPI->getScalarType();
7415 Replacement = new VPWidenIntrinsicRecipe(*CI, ID, Ops, ResultTy, *VPI,
7416 *VPI, VPI->getDebugLoc());
7417 break;
7418 }
7419 case CallWideningDecision::KindTy::VectorVariant: {
7420 // Masked variants take the mask as a trailing parameter, so they have
7421 // one more parameter than the original call's arguments.
 // An unmasked instruction gets an all-true mask from the plan.
7422 if (Decision.Variant->arg_size() > Ops.size()) {
7423 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7424 Ops.push_back(Mask);
7425 }
 // Append the last non-mask operand of VPI (presumably the called
 // function operand - TODO confirm).
7426 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7427 Replacement = new VPWidenCallRecipe(CI, Decision.Variant, Ops, *VPI,
7428 *VPI, VPI->getDebugLoc());
7429 break;
7430 }
7431 case CallWideningDecision::KindTy::Scalarize:
7432 Replacement = RecipeBuilder.handleReplication(VPI, Range);
7433 break;
7434 }
7435
 // Every switch case assigns Replacement before it is used here.
7436 Replacement->insertBefore(VPI);
7437 VPI->replaceAllUsesWith(Replacement);
7438 VPI->eraseFromParent();
7439 }
7440 }
7441}
7442
7445 Loop &L, VPCostContext &Ctx,
7446 VFRange &Range) {
 // NOTE(review): listing lines 7443 and 7444 (the start of the signature,
 // including the function name and the `Plan` and `PSE` parameters used
 // below) are absent from this listing.
 //
 // Overview of the visible body: for each non-consecutive widened load whose
 // address is a widened GEP with a matching add-recurrence SCEV on L, and for
 // which the strided form is legal and strictly cheaper, build a base pointer
 // plus vector pointer and emit an experimental_vp_strided_load intrinsic that
 // takes over all uses of the original load.
7447 if (Plan.hasScalarVFOnly())
7448 return;
7449
7450 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
 // Created lazily in the vector preheader and shared by all converted loads.
7451 VPValue *I32VF = nullptr;
 // NOTE(review): listing line 7452 (the loop head declaring VPBB) is absent.
7453 vp_depth_first_shallow(VectorLoop->getEntry()))) {
7454 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
7455 auto *LoadR = dyn_cast<VPWidenLoadRecipe>(&R);
7456 // TODO: Support strided store.
7457 // TODO: Transform reverse access into strided access with -1 stride.
7458 // TODO: Transform gather/scatter with uniform address into strided access
7459 // with 0 stride.
7460 // TODO: Transform interleave access into multiple strided accesses.
7461 if (!LoadR || LoadR->isConsecutive())
7462 continue;
7463
7464 auto *Ptr = dyn_cast<VPWidenGEPRecipe>(LoadR->getAddr());
7465 if (!Ptr)
7466 continue;
7467
7468 // Check if this is a strided access by analyzing the address SCEV for an
7469 // affine addRec.
7470 const SCEV *PtrSCEV = vputils::getSCEVExprForVPValue(Ptr, PSE, &L);
7471 const SCEV *Start;
7472 const SCEVConstant *Step;
7473 // TODO: Support non-constant loop invariant stride.
 // NOTE(review): listing line 7475 (the pattern that binds Start and Step)
 // is absent.
7474 if (!match(PtrSCEV,
7476 m_SpecificLoop(&L))))
7477 continue;
7478
7479 Type *LoadTy = LoadR->getScalarType();
7480 Align Alignment = LoadR->getAlign();
 // Profitable means the strided form is legal for the vector type at VF
 // and strictly cheaper than the current load.
7481 auto IsProfitable = [&](ElementCount VF) {
7482 Type *DataTy = toVectorTy(LoadTy, VF);
7483 if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
7484 return false;
7485 const InstructionCost CurrentCost = LoadR->computeCost(VF, Ctx);
 // NOTE(review): listing line 7487 (the callee computing this cost) is
 // absent.
7486 const InstructionCost StridedLoadStoreCost =
7488 Intrinsic::experimental_vp_strided_load, DataTy,
7489 LoadR->isMasked(), Alignment, Ctx);
7490 return StridedLoadStoreCost < CurrentCost;
7491 };
7492
 // NOTE(review): listing line 7493 (the head of this condition, which
 // presumably passes IsProfitable) is absent.
7494 Range))
7495 continue;
7496
7497 // Invalidate the legacy widening decision so the cost of replaced load is
7498 // not counted during precomputeCosts.
7499 // TODO: Remove once the legacy exit cost computation is retired.
7500 for (ElementCount VF : Range)
7501 Ctx.invalidateWideningDecision(&LoadR->getIngredient(), VF);
7502
7503 // Get VF as i32 for the vector length operand.
7504 if (!I32VF) {
7505 VPBuilder Builder(Plan.getVectorPreheader());
 // NOTE(review): listing line 7508 (the remaining arguments of this call)
 // is absent.
7506 I32VF = Builder.createScalarZExtOrTrunc(
7507 &Plan.getVF(), Type::getInt32Ty(Plan.getContext()),
7509 }
7510
 // New recipes are built with a builder positioned at LoadR.
7511 VPBuilder Builder(LoadR);
7512 // Create the base pointer of strided access.
7513 // TODO: reuse VPDerivedIVRecipe for base pointer computation when it
7514 // supports a general VPValue as the start value.
7515 VPValue *StartVPV = vputils::getOrCreateVPValueForSCEVExpr(Plan, Start);
7516 VPValue *StrideInBytes = Plan.getOrAddLiveIn(Step->getValue());
7517 Type *IndexTy = Plan.getDataLayout().getIndexType(Ptr->getScalarType());
7518 assert(IndexTy == StrideInBytes->getScalarType() &&
7519 "Stride type from SCEV must match the index type");
 // The canonical IV is sign-extended or truncated to the index type.
7520 VPValue *CanIV = Builder.createScalarSExtOrTrunc(
7521 VectorLoop->getCanonicalIV(), IndexTy,
7522 VectorLoop->getCanonicalIVType(), DebugLoc::getUnknown());
7523 auto *AddRecPtr = cast<SCEVAddRecExpr>(PtrSCEV);
 // Offset = CanIV * StrideInBytes, carrying the add-recurrence's no-wrap
 // flags.
7524 auto *Offset = Builder.createOverflowingOp(
7525 Instruction::Mul, {CanIV, StrideInBytes},
7526 {AddRecPtr->hasNoUnsignedWrap(), AddRecPtr->hasNoSignedWrap()});
 // NOTE(review): listing line 7530 (the alternative used when the
 // add-recurrence has no NUW flag) is absent.
7527 auto *BasePtr = Builder.createNoWrapPtrAdd(
7528 StartVPV, Offset,
7529 AddRecPtr->hasNoUnsignedWrap() ? GEPNoWrapFlags::noUnsignedWrap()
7531
7532 // Create a new vector pointer for strided access.
7533 VPValue *NewPtr = Builder.createVectorPointer(
7534 BasePtr, Type::getInt8Ty(Plan.getContext()), StrideInBytes,
7535 Ptr->getGEPNoWrapFlags(), Ptr->getDebugLoc());
7536
 // An unmasked load is given an all-true mask for the intrinsic.
7537 VPValue *Mask = LoadR->getMask();
7538 if (!Mask)
7539 Mask = Plan.getTrue();
7540 auto *StridedLoad = Builder.createWidenMemIntrinsic(
7541 Intrinsic::experimental_vp_strided_load,
7542 {NewPtr, StrideInBytes, Mask, I32VF}, LoadTy, Alignment, *LoadR,
7543 LoadR->getDebugLoc());
 // LoadR loses its uses but is not erased in the visible lines.
7544 LoadR->replaceAllUsesWith(StridedLoad);
7545 }
7546 }
7547}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:854
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, const MachineInstr &B)
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool handleUncountableExitsWithSideEffects(VPlan &Plan, SmallVectorImpl< EarlyExitInfo > &Exits, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC)
Update Plan to mask memory operations in the loop based on whether the early exit is taken or not.
static CallWideningDecision decideCallWidening(VPInstruction &VPI, ArrayRef< VPValue * > Ops, ElementCount VF, VPCostContext &CostCtx)
Pick the cheapest widening for the call VPI at VF among scalarization, vector intrinsic,...
static bool areVFParamsOk(const VFInfo &Info, ArrayRef< VPValue * > Args, PredicatedScalarEvolution &PSE, const Loop *L)
Returns true if Info's parameter kinds are compatible with Args.
static std::optional< VPValue * > getRecipesForUncountableExit(SmallVectorImpl< VPInstruction * > &Recipes, VPBasicBlock *LatchVPBB)
Returns the VPValue representing the uncountable exit comparison used by AnyOf if the recipes it depe...
static bool simplifyLogicalRecipe(VPSingleDefRecipe *Def, VPBuilder &Builder, bool CanCreateNewRecipe)
Try to simplify logical and bitwise recipes in Def.
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static Type * getLoadStoreValueType(VPReplicateRecipe *R, bool IsLoad)
Get the value type of the replicate load or store.
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static Function * findVectorVariant(CallInst *CI, ArrayRef< VPValue * > Args, ElementCount VF, bool MaskRequired, PredicatedScalarEvolution &PSE, const Loop *L)
Find a vector variant of CI for VF, respecting MaskRequired.
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void simplifyRecipe(VPSingleDefRecipe *Def)
Try to simplify VPSingleDefRecipe Def.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV and their load-store type,...
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static std::optional< Intrinsic::ID > getVPDivRemIntrinsic(Intrinsic::ID IntrID)
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPValue * narrowInterleaveGroupOp(ArrayRef< VPValue * > Members, SmallPtrSetImpl< VPValue * > &NarrowedOps, VPBasicBlock *Preheader)
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL)
Try to fold R using InstSimplifyFolder.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static void expandVPDerivedIV(VPDerivedIVRecipe *R)
Expand a VPDerivedIVRecipe into executable recipes.
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static std::optional< Instruction::BinaryOps > getUnmaskedDivRemOpcode(Intrinsic::ID ID)
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L)
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(ArrayRef< VPReplicateRecipe * > ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L)
SinkStoreInfo(VPReplicateRecipe &GroupLeader)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1692
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:194
const T & front() const
Get the first element.
Definition ArrayRef.h:144
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:152
static DebugLoc getUnknown()
Definition DebugLoc.h:151
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:262
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
size_t arg_size() const
Definition Function.h:875
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags noUnsignedWrap()
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1659
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1069
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describe the aliasing of the location (each member is null if that kind of ...
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class represents a constant integer value.
ConstantInt * getValue() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:76
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:4055
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4407
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4482
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4434
iterator end()
Definition VPlan.h:4444
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4442
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4495
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:266
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:560
const VPRecipeBase & front() const
Definition VPlan.h:4454
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
const VPRecipeBase & back() const
Definition VPlan.h:4456
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2957
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:3007
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2997
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:3013
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2993
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccs as successor of this VPBlockBase.
Definition VPlan.h:315
VPRegionBlock * getParent()
Definition VPlan.h:186
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:236
size_t getNumSuccessors() const
Definition VPlan.h:237
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:306
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
const std::string & getName() const
Definition VPlan.h:177
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:325
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:233
void clearPredecessors()
Remove all the predecessors of this block.
Definition VPlan.h:322
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:279
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:227
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:211
static auto blocksAs(T &&Range)
Return an iterator range over Range with each block cast to BlockTy.
Definition VPlanUtils.h:331
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:350
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:240
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:258
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:276
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:312
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:296
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-successor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3502
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAdd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false})
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createLogicalOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1646
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPWidenCastRecipe * createWidenCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={}, Type *ResultTy=nullptr)
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
static VPSingleDefRecipe * createSingleScalarOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPValue *Mask, const VPIRFlags &Flags, const VPIRMetadata &Metadata, DebugLoc DL, Instruction *UV)
Create a single-scalar recipe with Opcode and Operands without inserting it.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:4087
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:562
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:535
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:547
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:557
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4188
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B) const
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3547
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2436
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2483
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2472
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2163
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4560
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
void dropPoisonGeneratingFlags()
Drop all poison-generating flags.
Definition VPlan.h:892
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode; otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1473
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
unsigned getOpcode() const
Definition VPlan.h:1417
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3109
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3101
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3130
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3182
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:3140
void addIncoming(VPValue *IncomingV)
Append IncomingV as an incoming value to the phi-like recipe.
Definition VPlan.h:1665
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3713
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPSingleDefRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a replicating or single-scalar recipe for VPI.
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:338
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3353
A recipe for handling reduction phis.
Definition VPlan.h:2864
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2915
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2908
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2921
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3233
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4617
const VPBlockBase * getEntry() const
Definition VPlan.h:4661
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4693
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:857
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4678
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4737
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4745
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4729
const VPBlockBase * getExiting() const
Definition VPlan.h:4673
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4686
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3398
bool isSingleScalar() const
Definition VPlan.h:3456
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
operand_range operandsWithoutMask()
Return the recipe's operands, excluding the mask of a predicated recipe.
Definition VPlan.h:3481
bool isPredicated() const
Definition VPlan.h:3458
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3475
Lightweight SCEV-to-VPlan expander.
Definition VPlanUtils.h:178
VPValue * tryToExpand(const SCEV *S)
Try to expand S into recipes and live-ins using the builder.
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4255
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:385
operand_range operands()
Definition VPlanValue.h:458
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:431
unsigned getNumOperands() const
Definition VPlanValue.h:425
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:426
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
bool hasDefiningRecipe() const
Returns true if this VPValue is defined by a recipe.
Definition VPlanValue.h:203
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1471
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:164
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
bool user_empty() const
Definition VPlanValue.h:161
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:209
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:179
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1474
unsigned getNumUsers() const
Definition VPlanValue.h:115
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1480
user_range users()
Definition VPlanValue.h:157
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2266
A recipe for widening Call instructions using library calls.
Definition VPlan.h:2097
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1878
Instruction::CastOps getOpcode() const
Definition VPlan.h:1914
A recipe for handling GEP instructions.
Definition VPlan.h:2206
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2516
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2564
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2582
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2567
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2587
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2623
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2670
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2674
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2685
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2696
A recipe for widening vector intrinsics.
Definition VPlan.h:1925
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
A common mixin class for widening memory operations.
Definition VPlan.h:3749
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
A recipe for widened phis.
Definition VPlan.h:2754
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1817
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1838
unsigned getOpcode() const
Definition VPlan.h:1857
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4765
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:5095
bool hasVF(ElementCount VF) const
Definition VPlan.h:4988
const DataLayout & getDataLayout() const
Definition VPlan.h:4970
LLVMContext & getContext() const
Definition VPlan.h:4966
VPBasicBlock * getEntry()
Definition VPlan.h:4861
bool hasScalableVF() const
Definition VPlan.h:4989
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4924
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4945
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4995
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:5061
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4964
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:5067
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:5144
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:5098
bool hasUF(unsigned UF) const
Definition VPlan.h:5013
VPIRValue * getPoison(Type *Ty)
Return a VPIRValue wrapping a poison value of type Ty.
Definition VPlan.h:5089
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4914
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4954
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4951
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:5038
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:5064
void setVF(ElementCount VF)
Definition VPlan.h:4976
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:5029
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1061
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:5016
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4938
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4890
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:5121
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:5058
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4866
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4961
bool hasScalarVFOnly() const
Definition VPlan.h:5006
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4904
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4957
void setUF(unsigned UF)
Definition VPlan.h:5021
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:5176
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1217
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5072
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2798
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:190
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
SelectLike_match< CondTy, LTy, RTy > m_SelectLike(const CondTy &C, const LTy &TrueC, const RTy &FalseC)
Matches a value that behaves like a boolean-controlled select, i.e.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
canonical_widen_iv_match m_CanonicalWidenIV()
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
auto m_AnyNeg(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPValue * findIncomingAliasMask(const VPlan &Plan)
Finds the incoming alias-mask within the vector preheader.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:128
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) Note: If ...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
SmallVector< VPBasicBlock * > vp_rpo_plain_cfg_loop_body(VPBasicBlock *Header)
Returns the VPBasicBlocks forming the loop body of a plain (pre-region) VPlan in reverse post-order s...
Definition VPlanCFG.h:265
@ Offset
Definition DWP.cpp:558
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
constexpr auto bind_back(FnT &&Fn, BindArgsT &&...BindArgs)
C++23 bind_back.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:288
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2200
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr size_t range_size(R &&Range)
Returns the size of the Range, i.e., the number of elements.
Definition STLExtras.h:1694
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:79
@ MaskedHandleExitInScalarLoop
All memory operations other than the load(s) required to determine whether an uncountable exit occurr...
Definition VPlan.h:89
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1843
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1837
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:305
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) wo...
Definition Loads.cpp:304
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:285
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
VPBasicBlock * EarlyExitingVPBB
VPIRBasicBlock * EarlyExitVPBB
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2846
Holds the VFShape for a specific scalar to vector function mapping.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1939
bool isMaskRequired(Instruction *I) const
Forwards to LoopVectorizationCostModel::isMaskRequired.
PredicatedScalarEvolution & PSE
bool willBeScalarized(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalarized at VF.
TargetTransformInfo::TargetCostKind CostKind
const TargetLibraryInfo & TLI
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:247
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:147
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:287
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:298
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3863
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3813
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3966
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3912
static VPValue * materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheckVPBB, ArrayRef< PointerDiffInfo > DiffChecks)
Materializes the alias mask within the AliasCheckVPBB block.
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static decltype(auto) runPass(StringRef PassName, PassTy &&Pass, VPlan &Plan, ArgsTy &&...Args)
Helper to run a VPlan pass Pass on VPlan, forwarding extra arguments to the pass.
static void expandSCEVsToVPInstructions(VPlan &Plan, ScalarEvolution &SE)
Try to expand VPExpandSCEVRecipes in Plan's entry block to VPInstructions.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static void materializeAliasMaskCheckBlock(VPlan &Plan, ArrayRef< PointerDiffInfo > DiffChecks, bool HasBranchWeights)
Materializes the alias mask within a check block before the loop.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand remaining VPExpandSCEVRecipes in Plan's entry block using SCEVExpander.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void attachAliasMaskToHeaderMask(VPlan &Plan)
Attaches the alias-mask to the existing header-mask.
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decisions prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replace all uses of the canonical ...
static void simplifyReverses(VPlan &Plan)
Cancel out redundant reverses in Plan, e.g. reverse(reverse(x)) -> x.
static void makeCallWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx)
Convert call VPInstructions in Plan into widened call, vector intrinsic or replicate recipes based on...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap, const VPDominatorTree &VPDT)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void convertToStridedAccesses(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L, VPCostContext &Ctx, VFRange &Range)
Transform widen memory recipes into strided access recipes when legal and profitable.
static bool handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, Loop *TheLoop, PredicatedScalarEvolution &PSE, DominatorTree &DT, AssumptionCache *AC, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void attachVPCheckBlock(VPlan &Plan, VPValue *Cond, VPBasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...