1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(PhiR->operands(), PhiR->getDebugLoc(),
75 Phi->getName());
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
83 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
84 NewRecipe = new VPWidenStoreRecipe(
85 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
86 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
87 Ingredient.getDebugLoc());
88 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
89 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
90 Ingredient.getDebugLoc());
91 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
92 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
93 if (VectorID == Intrinsic::not_intrinsic)
94 return false;
95
96 // The noalias.scope.decl intrinsic declares a noalias scope that
97 // is valid for a single iteration. Emitting it as a single-scalar
98 // replicate would incorrectly extend the scope across multiple
99 // original iterations packed into one vector iteration.
100 // FIXME: If we want to vectorize this loop, then we have to drop
101 // all the associated !alias.scope and !noalias.
102 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
103 return false;
104
105 // These intrinsics are recognized by getVectorIntrinsicIDForCall
106 // but are not widenable. Emit them as replicate instead of widening.
107 if (VectorID == Intrinsic::assume ||
108 VectorID == Intrinsic::lifetime_end ||
109 VectorID == Intrinsic::lifetime_start ||
110 VectorID == Intrinsic::sideeffect ||
111 VectorID == Intrinsic::pseudoprobe) {
112 // If the operand of llvm.assume holds before vectorization, it will
113 // also hold per lane.
114 // llvm.pseudoprobe must be duplicated per lane to maintain an accurate
115 // sample count.
116 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
117 VectorID != Intrinsic::pseudoprobe;
118 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
119 /*IsSingleScalar=*/IsSingleScalar,
120 /*Mask=*/nullptr, *VPI, *VPI,
121 Ingredient.getDebugLoc());
122 } else {
123 NewRecipe = new VPWidenIntrinsicRecipe(
124 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
125 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
126 }
127 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
128 NewRecipe = new VPWidenCastRecipe(
129 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
130 VPIRFlags(*CI), VPIRMetadata(*CI));
131 } else {
132 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
133 *VPI, Ingredient.getDebugLoc());
134 }
135 } else {
137 "inductions must be created earlier");
138 continue;
139 }
140
141 NewRecipe->insertBefore(&Ingredient);
142 if (NewRecipe->getNumDefinedValues() == 1)
143 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
144 else
145 assert(NewRecipe->getNumDefinedValues() == 0 &&
146 "Only recpies with zero or one defined values expected");
147 Ingredient.eraseFromParent();
148 }
149 }
150 return true;
151}
152
153/// Helper for extra no-alias checks via known-safe recipe and SCEV.
155 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
156 VPReplicateRecipe &GroupLeader;
158 const Loop &L;
159 VPTypeAnalysis &TypeInfo;
160
161 // Return true if \p A and \p B are known to not alias for all VFs in the
162 // plan, checked via the distance between the accesses.
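  // For example (an illustrative sketch with concrete values): two i32 stores
  // whose addresses are a constant 64 bytes apart cannot overlap for any
  // fixed VF up to 16, since 16 lanes * 4 bytes = 64 bytes. Scalable VFs are
  // rejected conservatively below.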
163 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
164 if (A->getOpcode() != Instruction::Store ||
165 B->getOpcode() != Instruction::Store)
166 return false;
167
168 VPValue *AddrA = A->getOperand(1);
169 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
170 VPValue *AddrB = B->getOperand(1);
171 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
172 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
173 return false;
174
175 const APInt *Distance;
176 ScalarEvolution &SE = *PSE.getSE();
177 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
178 return false;
179
180 const DataLayout &DL = SE.getDataLayout();
181 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
182 uint64_t SizeA = DL.getTypeStoreSize(TyA);
183 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
184 uint64_t SizeB = DL.getTypeStoreSize(TyB);
185
186 // Use the maximum store size to ensure no overlap from either direction.
187 // Currently only handles fixed sizes, as it is only used for
188 // replicating VPReplicateRecipes.
189 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
190
191 auto VFs = B->getParent()->getPlan()->vectorFactors();
193 if (MaxVF.isScalable())
194 return false;
195 return Distance->abs().uge(
196 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
197 }
198
199public:
202 const Loop &L, VPTypeAnalysis &TypeInfo)
203 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
204 L(L), TypeInfo(TypeInfo) {}
205
206 /// Return true if \p R should be skipped during alias checking, either
207 /// because it's in the exclude set or because no-alias can be proven via
208 /// SCEV.
209 bool shouldSkip(VPRecipeBase &R) const {
210 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
211 return ExcludeRecipes.contains(&R) ||
212 (Store && isNoAliasViaDistance(Store, &GroupLeader));
213 }
214};
215
216/// Check if a memory operation doesn't alias with memory operations using
217/// scoped noalias metadata, in blocks in the single-successor chain between \p
218/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
219/// write to memory are checked (for load hoisting). Otherwise recipes that may
220/// read or write memory are checked, and SCEV is used to prove no-alias
221/// between the group leader and other replicate recipes (for store sinking).
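/// For example (a rough IR sketch of the metadata this relies on):
///   store i32 %v, ptr %p, !alias.scope !0
///   %l = load i32, ptr %q, !noalias !0
/// The load is known not to alias the store because it lists the store's
/// scope in its !noalias metadata; accesses without scope metadata are
/// conservatively assumed to alias.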
222static bool
224 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
225 std::optional<SinkStoreInfo> SinkInfo = {}) {
226 bool CheckReads = SinkInfo.has_value();
227 if (!MemLoc.AATags.Scope)
228 return false;
229
230 for (VPBasicBlock *VPBB :
232 for (VPRecipeBase &R : *VPBB) {
233 if (SinkInfo && SinkInfo->shouldSkip(R))
234 continue;
235
236 // Skip recipes that don't need checking.
237 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
238 continue;
239
241 if (!Loc)
242 // Conservatively assume aliasing for memory operations without
243 // location.
244 return false;
245
247 return false;
248 }
249 }
250 return true;
251}
252
253/// Collect either replicated Loads or Stores grouped by their address SCEV, in
254/// a deep-traversal of the vector loop region in \p Plan.
255template <unsigned Opcode>
258 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
259 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
260 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
261 "Only Load and Store opcodes supported");
262 constexpr bool IsLoad = (Opcode == Instruction::Load);
264 RecipesByAddress;
267 for (VPRecipeBase &R : *VPBB) {
268 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
269 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
270 continue;
271
272 // For loads, operand 0 is address; for stores, operand 1 is address.
273 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
274 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
275 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
276 RecipesByAddress[AddrSCEV].push_back(RepR);
277 }
278 }
279 auto Groups = to_vector(RecipesByAddress.values());
280 VPDominatorTree VPDT(Plan);
281 for (auto &Group : Groups) {
282 // Sort mem ops by dominance order, with earliest (most dominating) first.
284 return VPDT.properlyDominates(A, B);
285 });
286 }
287 return Groups;
288}
289
290static bool sinkScalarOperands(VPlan &Plan) {
291 auto Iter = vp_depth_first_deep(Plan.getEntry());
292 bool ScalarVFOnly = Plan.hasScalarVFOnly();
293 bool Changed = false;
294
296 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
297 VPBasicBlock *SinkTo, VPValue *Op) {
298 auto *Candidate =
299 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
300 if (!Candidate)
301 return;
302
303 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
304 // for now.
306 return;
307
308 if (Candidate->getParent() == SinkTo ||
309 vputils::cannotHoistOrSinkRecipe(*Candidate, /*Sinking=*/true))
310 return;
311
312 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
313 if (!ScalarVFOnly && RepR->isSingleScalar())
314 return;
315
316 WorkList.insert({SinkTo, Candidate});
317 };
318
319 // First, collect the operands of all recipes in replicate blocks as seeds for
320 // sinking.
322 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
323 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
324 continue;
325 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
326 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
327 continue;
328 for (auto &Recipe : *VPBB)
329 for (VPValue *Op : Recipe.operands())
330 InsertIfValidSinkCandidate(VPBB, Op);
331 }
332
333 // Try to sink each replicate or scalar IV steps recipe in the worklist.
334 for (unsigned I = 0; I != WorkList.size(); ++I) {
335 VPBasicBlock *SinkTo;
336 VPSingleDefRecipe *SinkCandidate;
337 std::tie(SinkTo, SinkCandidate) = WorkList[I];
338
339 // All recipe users of SinkCandidate must be in the same block SinkTo or all
340 // users outside of SinkTo must only use the first lane of SinkCandidate. In
341 // the latter case, we need to duplicate SinkCandidate.
342 auto UsersOutsideSinkTo =
343 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
344 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
345 });
346 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
347 return !U->usesFirstLaneOnly(SinkCandidate);
348 }))
349 continue;
350 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
351
352 if (NeedsDuplicating) {
353 if (ScalarVFOnly)
354 continue;
355 VPSingleDefRecipe *Clone;
356 if (auto *SinkCandidateRepR =
357 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
358 // TODO: Handle converting to uniform recipes as separate transform,
359 // then cloning should be sufficient here.
360 Instruction *I = SinkCandidate->getUnderlyingInstr();
361 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
362 nullptr /*Mask*/, *SinkCandidateRepR,
363 *SinkCandidateRepR);
364 // TODO: add ".cloned" suffix to name of Clone's VPValue.
365 } else {
366 Clone = SinkCandidate->clone();
367 }
368
369 Clone->insertBefore(SinkCandidate);
370 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
371 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
372 });
373 }
374 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
375 for (VPValue *Op : SinkCandidate->operands())
376 InsertIfValidSinkCandidate(SinkTo, Op);
377 Changed = true;
378 }
379 return Changed;
380}
381
382/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
383/// the mask.
385 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
386 if (!EntryBB || EntryBB->size() != 1 ||
387 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
388 return nullptr;
389
390 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
391}
392
393/// If \p R is a triangle region, return the 'then' block of the triangle.
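/// A triangle region has the shape (sketch):
///      entry
///      /   \
///   then    |
///      \    |
///      merge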
395 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
396 if (EntryBB->getNumSuccessors() != 2)
397 return nullptr;
398
399 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
400 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
401 if (!Succ0 || !Succ1)
402 return nullptr;
403
404 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
405 return nullptr;
406 if (Succ0->getSingleSuccessor() == Succ1)
407 return Succ0;
408 if (Succ1->getSingleSuccessor() == Succ0)
409 return Succ1;
410 return nullptr;
411}
412
413// Merge replicate regions in their successor region, if a replicate region
414// is connected to a successor replicate region with the same predicate by a
415// single, empty VPBasicBlock.
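// For example (sketch), given two replicate regions guarded by the same mask:
//   region1 (mask %m) -> empty VPBB -> region2 (mask %m)
// the recipes from region1's 'then' block are moved into region2's 'then'
// block and region1 is removed, leaving:
//   empty VPBB -> region2 (mask %m)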
417 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
418
419 // Collect replicate regions followed by an empty block, followed by another
420 // replicate region with matching masks, to process up front. This avoids
421 // iterator invalidation issues while merging regions.
424 vp_depth_first_deep(Plan.getEntry()))) {
425 if (!Region1->isReplicator())
426 continue;
427 auto *MiddleBasicBlock =
428 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
429 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
430 continue;
431
432 auto *Region2 =
433 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
434 if (!Region2 || !Region2->isReplicator())
435 continue;
436
437 VPValue *Mask1 = getPredicatedMask(Region1);
438 VPValue *Mask2 = getPredicatedMask(Region2);
439 if (!Mask1 || Mask1 != Mask2)
440 continue;
441
442 assert(Mask1 && Mask2 && "both regions must have conditions");
443 WorkList.push_back(Region1);
444 }
445
446 // Move recipes from Region1 to its successor region, if both are triangles.
447 for (VPRegionBlock *Region1 : WorkList) {
448 if (TransformedRegions.contains(Region1))
449 continue;
450 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
451 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
452
453 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
454 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
455 if (!Then1 || !Then2)
456 continue;
457
458 // Note: No fusion-preventing memory dependencies are expected in either
459 // region. Such dependencies should be rejected during earlier dependence
460 // checks, which guarantee accesses can be re-ordered for vectorization.
461 //
462 // Move recipes to the successor region.
463 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
464 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
465
466 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
467 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
468
469 // Move VPPredInstPHIRecipes from the merge block to the successor region's
470 // merge block. Update all users inside the successor region to use the
471 // original values.
472 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
473 VPValue *PredInst1 =
474 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
475 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
476 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
477 return cast<VPRecipeBase>(&U)->getParent() == Then2;
478 });
479
480 // Remove phi recipes that are unused after merging the regions.
481 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
482 Phi1ToMove.eraseFromParent();
483 continue;
484 }
485 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
486 }
487
488 // Remove the dead recipes in Region1's entry block.
489 for (VPRecipeBase &R :
490 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
491 R.eraseFromParent();
492
493 // Finally, remove the first region.
494 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
495 VPBlockUtils::disconnectBlocks(Pred, Region1);
496 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
497 }
498 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
499 TransformedRegions.insert(Region1);
500 }
501
502 return !TransformedRegions.empty();
503}
504
506 VPRegionBlock *ParentRegion,
507 VPlan &Plan) {
508 Instruction *Instr = PredRecipe->getUnderlyingInstr();
509 // Build the triangular if-then region.
510 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
511 assert(Instr->getParent() && "Predicated instruction not in any basic block");
512 auto *BlockInMask = PredRecipe->getMask();
513 auto *MaskDef = BlockInMask->getDefiningRecipe();
514 auto *BOMRecipe = new VPBranchOnMaskRecipe(
515 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
516 auto *Entry =
517 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
518
519 // Replace predicated replicate recipe with a replicate recipe without a
520 // mask but in the replicate region.
521 auto *RecipeWithoutMask = new VPReplicateRecipe(
522 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
523 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
524 PredRecipe->getDebugLoc());
525 auto *Pred =
526 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
527 auto *Exiting = Plan.createVPBasicBlock(Twine(RegionName) + ".continue");
529 Plan.createReplicateRegion(Entry, Exiting, RegionName);
530
531 // Note: first set Entry as region entry and then connect successors starting
532 // from it in order, to propagate the "parent" of each VPBasicBlock.
533 Region->setParent(ParentRegion);
534 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
535 VPBlockUtils::connectBlocks(Pred, Exiting);
536
537 if (PredRecipe->getNumUsers() != 0) {
538 auto *PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
539 RecipeWithoutMask->getDebugLoc());
540 Exiting->appendRecipe(PHIRecipe);
541 PredRecipe->replaceAllUsesWith(PHIRecipe);
542 }
543 PredRecipe->eraseFromParent();
544 return Region;
545}
546
547static void addReplicateRegions(VPlan &Plan) {
550 vp_depth_first_deep(Plan.getEntry()))) {
551 for (VPRecipeBase &R : *VPBB)
552 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
553 if (RepR->isPredicated())
554 WorkList.push_back(RepR);
555 }
556 }
557
558 unsigned BBNum = 0;
559 for (VPReplicateRecipe *RepR : WorkList) {
560 VPBasicBlock *CurrentBlock = RepR->getParent();
561 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
562
563 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
564 SplitBlock->setName(
565 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
566 // Record predicated instructions for above packing optimizations.
568 createReplicateRegion(RepR, CurrentBlock->getParent(), Plan);
570
571 VPRegionBlock *ParentRegion = Region->getParent();
572 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
573 ParentRegion->setExiting(SplitBlock);
574 }
575}
576
580 vp_depth_first_deep(Plan.getEntry()))) {
581 // Don't fold the blocks in the skeleton of the Plan into their single
582 // predecessors for now.
583 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
584 if (!VPBB->getParent())
585 continue;
586 auto *PredVPBB =
587 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
588 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
589 isa<VPIRBasicBlock>(PredVPBB))
590 continue;
591 WorkList.push_back(VPBB);
592 }
593
594 for (VPBasicBlock *VPBB : WorkList) {
595 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
596 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
597 R.moveBefore(*PredVPBB, PredVPBB->end());
598 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
599 auto *ParentRegion = VPBB->getParent();
600 if (ParentRegion && ParentRegion->getExiting() == VPBB)
601 ParentRegion->setExiting(PredVPBB);
602 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
603 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
604 }
605 return !WorkList.empty();
606}
607
609 // Convert masked VPReplicateRecipes to if-then region blocks.
611
612 bool ShouldSimplify = true;
613 while (ShouldSimplify) {
614 ShouldSimplify = sinkScalarOperands(Plan);
615 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
616 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
617 }
618}
619
620/// Remove redundant casts of inductions.
621///
622/// Such redundant casts are casts of induction variables that can be ignored,
623/// because we already proved that the casted phi is equal to the uncasted phi
624/// in the vectorized loop. There is no need to vectorize the cast - the same
625/// value can be used for both the phi and casts in the vector loop.
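/// For example (sketch), for a recorded cast chain
///   %iv -> %c1 = cast(%iv) -> %c2 = cast(%c1)
/// all users of the final cast %c2 are redirected to use %iv directly; the
/// now-dead casts are removed by later cleanup.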
627 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
629 if (!IV || IV->getTruncInst())
630 continue;
631
632 // A sequence of IR Casts has potentially been recorded for IV, which
633 // *must be bypassed* when the IV is vectorized, because the vectorized IV
634 // will produce the desired casted value. This sequence forms a def-use
635 // chain and is provided in reverse order, ending with the cast that uses
636 // the IV phi. Search for the recipe of the last cast in the chain and
637 // replace it with the original IV. Note that only the final cast is
638 // expected to have users outside the cast-chain and the dead casts left
639 // over will be cleaned up later.
640 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
641 VPValue *FindMyCast = IV;
642 for (Instruction *IRCast : reverse(Casts)) {
643 VPSingleDefRecipe *FoundUserCast = nullptr;
644 for (auto *U : FindMyCast->users()) {
645 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
646 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
647 FoundUserCast = UserCast;
648 break;
649 }
650 }
651 // A cast recipe in the chain may have been removed by earlier DCE.
652 if (!FoundUserCast)
653 break;
654 FindMyCast = FoundUserCast;
655 }
656 if (FindMyCast != IV)
657 FindMyCast->replaceAllUsesWith(IV);
658 }
659}
660
663 Instruction::BinaryOps InductionOpcode,
664 FPMathOperator *FPBinOp, Instruction *TruncI,
665 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
666 VPBuilder &Builder) {
667 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
668 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
669 VPValue *CanonicalIV = LoopRegion->getCanonicalIV();
670 VPSingleDefRecipe *BaseIV =
671 Builder.createDerivedIV(Kind, FPBinOp, StartV, CanonicalIV, Step);
672
673 // Truncate base induction if needed.
674 VPTypeAnalysis TypeInfo(Plan);
675 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
676 if (TruncI) {
677 Type *TruncTy = TruncI->getType();
678 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
679 "Not truncating.");
680 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
681 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
682 ResultTy = TruncTy;
683 }
684
685 // Truncate step if needed.
686 Type *StepTy = TypeInfo.inferScalarType(Step);
687 if (ResultTy != StepTy) {
688 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
689 "Not truncating.");
690 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
691 auto *VecPreheader =
693 VPBuilder::InsertPointGuard Guard(Builder);
694 Builder.setInsertPoint(VecPreheader);
695 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
696 }
697 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
698 &Plan.getVF(), DL);
699}
700
702 VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI,
704 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
705 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
706 if (!LoopRegion)
707 return;
708
710 LoopRegion->getCanonicalIV());
711 if (!WideCanIV)
712 return;
713
714 Type *CanIVTy = LoopRegion->getCanonicalIVType();
715
716 // Replace the wide canonical IV with a scalar-iv-steps over the canonical
717 // IV.
718 if (Plan.hasScalarVFOnly() || vputils::onlyFirstLaneUsed(WideCanIV)) {
719 VPBuilder Builder(WideCanIV);
720 WideCanIV->replaceAllUsesWith(createScalarIVSteps(
721 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
722 nullptr, Plan.getZero(CanIVTy), Plan.getConstantInt(CanIVTy, 1),
723 WideCanIV->getDebugLoc(), Builder));
724 WideCanIV->eraseFromParent();
725 return;
726 }
727
728 if (vputils::onlyScalarValuesUsed(WideCanIV))
729 return;
730
731 // If a canonical VPWidenIntOrFpInductionRecipe already produces vector lanes
732 // in the header, reuse it instead of introducing another wide induction phi.
733 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
734 for (VPRecipeBase &Phi : Header->phis()) {
735 auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
736 if (!WidenIV || !WidenIV->isCanonical())
737 continue;
738 // The reused wide IV feeds the header mask, whose lanes may extend past
739 // the trip count; drop flags that only hold inside the scalar loop.
740 WidenIV->dropPoisonGeneratingFlags();
741 WideCanIV->replaceAllUsesWith(WidenIV);
742 WideCanIV->eraseFromParent();
743 return;
744 }
745
746 // Introduce a new VPWidenIntOrFpInductionRecipe if profitable.
747 auto *VecTy = VectorType::get(CanIVTy, VF);
748 InstructionCost BroadcastCost = TTI.getShuffleCost(
750 InstructionCost PHICost = TTI.getCFInstrCost(Instruction::PHI, CostKind);
751 if (PHICost > BroadcastCost)
752 return;
753
754 // Bail out if the additional wide induction phi increases the expected spill
755 // cost.
756 VPRegisterUsage UnrolledBase =
757 calculateRegisterUsageForPlan(Plan, VF, TTI, ValuesToIgnore)[0];
758 for (unsigned &NumUsers : make_second_range(UnrolledBase.MaxLocalUsers))
759 NumUsers *= UF;
760 unsigned RegClass = TTI.getRegisterClassForType(/*Vector=*/true, VecTy);
761 VPRegisterUsage Projected = UnrolledBase;
762 Projected.MaxLocalUsers[RegClass] += TTI.getRegUsageForType(VecTy);
763 if (Projected.spillCost(TTI, CostKind) >
764 UnrolledBase.spillCost(TTI, CostKind))
765 return;
766
769 VPValue *StepV = Plan.getConstantInt(CanIVTy, 1);
770 auto *NewWideIV = new VPWidenIntOrFpInductionRecipe(
771 /*IV=*/nullptr, Plan.getZero(CanIVTy), StepV, &Plan.getVF(), ID,
772 VPIRFlags::WrapFlagsTy(/*HasNUW=*/LoopRegion->hasCanonicalIVNUW(),
773 /*HasNSW=*/false),
774 WideCanIV->getDebugLoc());
775 NewWideIV->insertBefore(&*Header->getFirstNonPhi());
776 WideCanIV->replaceAllUsesWith(NewWideIV);
777 WideCanIV->eraseFromParent();
778}
779
780/// Returns true if \p R is dead and can be removed.
781static bool isDeadRecipe(VPRecipeBase &R) {
782 // Do remove conditional assume instructions as their conditions may be
783 // flattened.
784 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
785 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
787 if (IsConditionalAssume)
788 return true;
789
790 if (R.mayHaveSideEffects())
791 return false;
792
793 // Recipe is dead if no user keeps the recipe alive.
794 return all_of(R.definedValues(),
795 [](VPValue *V) { return V->getNumUsers() == 0; });
796}
797
800 Plan.getEntry());
802 // The recipes in the block are processed in reverse order, to catch chains
803 // of dead recipes.
804 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
805 if (isDeadRecipe(R)) {
806 R.eraseFromParent();
807 continue;
808 }
809
810 // Check if R is a dead VPPhi <-> update cycle and remove it.
811 VPValue *Start, *Incoming;
812 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
813 continue;
814 auto *PhiR = cast<VPPhi>(&R);
815 VPUser *PhiUser = PhiR->getSingleUser();
816 if (!PhiUser)
817 continue;
818 if (PhiUser != Incoming->getDefiningRecipe() ||
819 Incoming->getNumUsers() != 1)
820 continue;
821 PhiR->replaceAllUsesWith(Start);
822 PhiR->eraseFromParent();
823 Incoming->getDefiningRecipe()->eraseFromParent();
824 }
825 }
826}
827
830 for (unsigned I = 0; I != Users.size(); ++I) {
832 for (VPValue *V : Cur->definedValues())
833 Users.insert_range(V->users());
834 }
835 return Users.takeVector();
836}
837
838/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
839/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
840/// generates scalar values.
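/// For example (sketch, not exact VPlan syntax), a pointer induction with
/// start %base and step %step becomes
///   %steps = scalar-iv-steps(0, %step)   ; per-lane offsets 0, %step, ...
///   %gep   = ptr-add(%base, %steps)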
841static VPValue *
843 VPlan &Plan, VPBuilder &Builder) {
845 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
846 VPValue *StepV = PtrIV->getOperand(1);
848 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
849 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
850
851 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
852 PtrIV->getDebugLoc(), "next.gep");
853}
854
855/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
856/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
857/// VPWidenPointerInductionRecipe will generate vectors only. If some users
858/// require vectors while others require scalars, the scalar uses need to extract
859/// the scalars from the generated vectors (note that this is different from how
860/// int/fp inductions are handled). Legalize extract-from-ends using uniform
861/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
862/// the correct end value is available. Also optimize
863/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
864/// providing them scalar steps built on the canonical scalar IV and updating the
865/// original IV's users. This is an optional optimization to reduce the need for
866/// vector extracts.
869 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
870 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
871 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
872 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
873 if (!PhiR)
874 continue;
875
876 // Try to narrow wide and replicating recipes to uniform recipes, based on
877 // VPlan analysis.
878 // TODO: Apply to all recipes in the future, to replace legacy uniformity
879 // analysis.
880 auto Users = collectUsersRecursively(PhiR);
881 for (VPUser *U : reverse(Users)) {
882 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
883 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
884 // Skip recipes that shouldn't be narrowed.
885 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
886 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
887 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
888 continue;
889
890 // Skip recipes where lanes other than the first may be used.
892 continue;
893
894 // TODO: Support scalarizing ExtractValue.
895 if (match(Def,
897 continue;
898
899 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
900 Def->operands(), /*IsUniform*/ true,
901 /*Mask*/ nullptr, /*Flags*/ *Def);
902 Clone->insertAfter(Def);
903 Def->replaceAllUsesWith(Clone);
904 }
905
906 // Replace wide pointer inductions which have only their scalars used by
907 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
908 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
909 if (!Plan.hasScalarVFOnly() &&
910 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
911 continue;
912
913 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
914 PtrIV->replaceAllUsesWith(PtrAdd);
915 continue;
916 }
917
918 // Replace widened induction with scalar steps for users that only use
919 // scalars.
920 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
921 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
922 return U->usesScalars(WideIV);
923 }))
924 continue;
925
926 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
928 Plan, ID.getKind(), ID.getInductionOpcode(),
929 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
930 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
931 WideIV->getDebugLoc(), Builder);
932
933 // Update scalar users of IV to use Step instead.
934 if (!HasOnlyVectorVFs) {
935 assert(!Plan.hasScalableVF() &&
936 "plans containing a scalar VF cannot also include scalable VFs");
937 WideIV->replaceAllUsesWith(Steps);
938 } else {
939 bool HasScalableVF = Plan.hasScalableVF();
940 WideIV->replaceUsesWithIf(Steps,
941 [WideIV, HasScalableVF](VPUser &U, unsigned) {
942 if (HasScalableVF)
943 return U.usesFirstLaneOnly(WideIV);
944 return U.usesScalars(WideIV);
945 });
946 }
947 }
948}
949
950/// Check if \p VPV is an untruncated wide induction, either before or after the
951/// increment. If so return the header IV (before the increment), otherwise
952/// return null.
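/// For example (sketch), given
///   %iv      = WIDEN-INDUCTION ...
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are considered optimizable and %iv (the header phi)
/// is returned.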
955 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
956 if (WideIV) {
957 // VPV itself is a wide induction, separately compute the end value for exit
958 // users if it is not a truncated IV.
959 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
960 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
961 }
962
963 // Check if VPV is an optimizable induction increment.
964 VPRecipeBase *Def = VPV->getDefiningRecipe();
965 if (!Def || Def->getNumOperands() != 2)
966 return nullptr;
967 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
968 if (!WideIV)
969 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
970 if (!WideIV)
971 return nullptr;
972
973 auto IsWideIVInc = [&]() {
974 auto &ID = WideIV->getInductionDescriptor();
975
976 // Check if VPV increments the induction by the induction step.
977 VPValue *IVStep = WideIV->getStepValue();
978 switch (ID.getInductionOpcode()) {
979 case Instruction::Add:
980 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
981 case Instruction::FAdd:
982 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
983 case Instruction::FSub:
984 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
985 m_Specific(IVStep)));
986 case Instruction::Sub: {
987 // IVStep will be the negated step of the subtraction. Check if Step == -1
988 // * IVStep.
989 VPValue *Step;
990 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
991 return false;
992 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
993 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
994 ScalarEvolution &SE = *PSE.getSE();
995 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
996 !isa<SCEVCouldNotCompute>(StepSCEV) &&
997 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
998 }
999 default:
1000 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
1001 match(VPV, m_GetElementPtr(m_Specific(WideIV),
1002 m_Specific(WideIV->getStepValue())));
1003 }
1004 llvm_unreachable("should have been covered by switch above");
1005 };
1006 return IsWideIVInc() ? WideIV : nullptr;
1007}
1008
1009/// Attempts to optimize the induction variable exit values for users in the
1010/// early exit block.
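/// The exit value is computed as (sketch):
///   index = canonical-IV + first-active-lane(exit mask)
///           (+ 1 if the user refers to the incremented IV)
/// and, for non-canonical inductions, mapped through start + index * step via
/// a derived IV.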
1012 VPTypeAnalysis &TypeInfo,
1013 VPValue *Op,
1015 VPValue *Incoming, *Mask;
1017 m_VPValue(Incoming))))
1018 return nullptr;
1019
1020 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1021 if (!WideIV)
1022 return nullptr;
1023
1024 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1025 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1026 return nullptr;
1027
1028 // Calculate the final index.
1029 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1030 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1031 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1032 auto *ExtractR = cast<VPInstruction>(Op);
1033 VPBuilder B(ExtractR);
1034
1035 DebugLoc DL = ExtractR->getDebugLoc();
1036 VPValue *FirstActiveLane = B.createFirstActiveLane(Mask, DL);
1037 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1038 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1039 FirstActiveLaneType, DL);
1040 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1041
1042 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1043 // changed it means the exit is using the incremented value, so we need to
1044 // add the step.
1045 if (Incoming != WideIV) {
1046 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1047 EndValue = B.createAdd(EndValue, One, DL);
1048 }
1049
1050 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1051 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1052 VPIRValue *Start = WideIV->getStartValue();
1053 VPValue *Step = WideIV->getStepValue();
1054 EndValue = B.createDerivedIV(
1055 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1056 Start, EndValue, Step);
1057 }
1058
1059 return EndValue;
1060}
1061
1062/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1063/// VPDerivedIVRecipe for non-canonical inductions.
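/// For example (sketch), for an integer induction with start %start, step
/// %step and vector trip count %vtc, the end value is %start + %vtc * %step,
/// computed via a VPDerivedIVRecipe; for the canonical induction it is %vtc
/// itself.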
1065 VPBuilder &VectorPHBuilder,
1066 VPTypeAnalysis &TypeInfo,
1067 VPValue *VectorTC) {
1068 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1069 // Truncated wide inductions resume from the last lane of their vector value
1070 // in the last vector iteration which is handled elsewhere.
1071 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1072 return nullptr;
1073
1074 VPIRValue *Start = WideIV->getStartValue();
1075 VPValue *Step = WideIV->getStepValue();
1077 VPValue *EndValue = VectorTC;
1078 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1079 EndValue = VectorPHBuilder.createDerivedIV(
1080 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1081 Start, VectorTC, Step);
1082 }
1083
1084 // EndValue is derived from the vector trip count (which has the same type as
1085 // the widest induction) and thus may be wider than the induction here.
1086 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1087 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1088 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1089 ScalarTypeOfWideIV,
1090 WideIV->getDebugLoc());
1091 }
1092
1093 return EndValue;
1094}
1095
1096/// Attempts to optimize the induction variable exit values for users in the
1097/// exit block coming from the latch in the original scalar loop.
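/// For example (sketch), for an integer IV with end value %end and step
/// %step, an exit user of the pre-incremented IV receives
///   %ind.escape = sub %end, %step
/// while a user of the incremented IV uses %end directly.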
1099 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op,
1101 VPValue *Incoming;
1103 return nullptr;
1104
1105 VPWidenInductionRecipe *WideIV = getOptimizableIVOf(Incoming, PSE);
1106 if (!WideIV)
1107 return nullptr;
1108
1109 VPValue *EndValue = EndValues.lookup(WideIV);
1110 assert(EndValue && "Must have computed the end value up front");
1111
1112 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1113 // changed it means the exit is using the incremented value, so we don't
1114 // need to subtract the step.
1115 if (Incoming != WideIV)
1116 return EndValue;
1117
1118 // Otherwise, subtract the step from the EndValue.
1119 auto *ExtractR = cast<VPInstruction>(Op);
1120 VPBuilder B(ExtractR);
1121 VPValue *Step = WideIV->getStepValue();
1122 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1123 if (ScalarTy->isIntegerTy())
1124 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1125 if (ScalarTy->isPointerTy()) {
1126 Type *StepTy = TypeInfo.inferScalarType(Step);
1127 auto *Zero = Plan.getZero(StepTy);
1128 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1129 DebugLoc::getUnknown(), "ind.escape");
1130 }
1131 if (ScalarTy->isFloatingPointTy()) {
1132 const auto &ID = WideIV->getInductionDescriptor();
1133 return B.createNaryOp(
1134 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1135 ? Instruction::FSub
1136 : Instruction::FAdd,
1137 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1138 }
1139 llvm_unreachable("all possible induction types must be handled");
1140 return nullptr;
1141}
1142
1144 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1145 // Compute end values for all inductions.
1146 VPTypeAnalysis TypeInfo(Plan);
1147 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1148 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1149 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1151 VPValue *ResumeTC =
1152 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1153 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1154 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1155 if (!WideIV)
1156 continue;
1158 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1159 EndValues[WideIV] = EndValue;
1160 }
1161
1162 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1163 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1164 VPValue *Op;
1165 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1166 continue;
1167 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1168 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1169 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1170 R.eraseFromParent();
1171 }
1172 }
1173
1174 // Then, optimize exit block users.
1175 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1176 for (VPRecipeBase &R : ExitVPBB->phis()) {
1177 auto *ExitIRI = cast<VPIRPhi>(&R);
1178
1179 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1180 VPValue *Escape = nullptr;
1181 if (PredVPBB == MiddleVPBB)
1183 Plan, TypeInfo, ExitIRI->getOperand(Idx), EndValues, PSE);
1184 else
1186 Plan, TypeInfo, ExitIRI->getOperand(Idx), PSE);
1187 if (Escape)
1188 ExitIRI->setOperand(Idx, Escape);
1189 }
1190 }
1191 }
1192}
1193
1194/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1195/// them with already existing recipes expanding the same SCEV expression.
1198
1199 for (VPRecipeBase &R :
1201 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1202 if (!ExpR)
1203 continue;
1204
1205 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1206 if (Inserted)
1207 continue;
1208 ExpR->replaceAllUsesWith(V->second);
1209 ExpR->eraseFromParent();
1210 }
1211}
1212
1214 SmallVector<VPValue *> WorkList;
1216 WorkList.push_back(V);
1217
1218 while (!WorkList.empty()) {
1219 VPValue *Cur = WorkList.pop_back_val();
1220 if (!Seen.insert(Cur).second)
1221 continue;
1222 VPRecipeBase *R = Cur->getDefiningRecipe();
1223 if (!R)
1224 continue;
1225 if (!isDeadRecipe(*R))
1226 continue;
1227 append_range(WorkList, R->operands());
1228 R->eraseFromParent();
1229 }
1230}
1231
1232/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1233/// Returns an optional pair, where the first element indicates whether it is
1234/// an intrinsic ID.
1235static std::optional<std::pair<bool, unsigned>>
1237 return TypeSwitch<const VPSingleDefRecipe *,
1238 std::optional<std::pair<bool, unsigned>>>(R)
1241 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1242 .Case([](const VPWidenIntrinsicRecipe *I) {
1243 return std::make_pair(true, I->getVectorIntrinsicID());
1244 })
1245 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe, VPScalarIVStepsRecipe>(
1246 [](auto *I) {
1247 // For recipes that do not directly map to LLVM IR instructions,
1248 // assign opcodes after the last VPInstruction opcode (which is also
1249 // after the last IR Instruction opcode), based on the VPRecipeID.
1250 return std::make_pair(false, VPInstruction::OpsEnd + 1 +
1251 I->getVPRecipeID());
1252 })
1253 .Default([](auto *) { return std::nullopt; });
1254}
1255
1256/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1257/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1258/// Operands are foldable live-ins.
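/// For example, an add whose operands are the live-in constants 2 and 3 folds
/// to the live-in constant 5 (a minimal sketch; any handled opcode whose
/// operands are all foldable live-ins simplifies the same way).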
1260 ArrayRef<VPValue *> Operands,
1261 const DataLayout &DL,
1262 VPTypeAnalysis &TypeInfo) {
1263 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1264 if (!OpcodeOrIID)
1265 return nullptr;
1266
1268 for (VPValue *Op : Operands) {
1269 if (!match(Op, m_LiveIn()))
1270 return nullptr;
1271 Value *V = Op->getUnderlyingValue();
1272 if (!V)
1273 return nullptr;
1274 Ops.push_back(V);
1275 }
1276
1277 auto FoldToIRValue = [&]() -> Value * {
1278 InstSimplifyFolder Folder(DL);
1279 if (OpcodeOrIID->first) {
1280 if (R.getNumOperands() != 2)
1281 return nullptr;
1282 unsigned ID = OpcodeOrIID->second;
1283 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1284 TypeInfo.inferScalarType(&R));
1285 }
1286 unsigned Opcode = OpcodeOrIID->second;
1287 if (Instruction::isBinaryOp(Opcode))
1288 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1289 Ops[0], Ops[1]);
1290 if (Instruction::isCast(Opcode))
1291 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1292 TypeInfo.inferScalarType(R.getVPSingleValue()));
1293 switch (Opcode) {
1295 return Folder.FoldSelect(Ops[0], Ops[1],
1297 case VPInstruction::Not:
1298 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1300 case Instruction::Select:
1301 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1302 case Instruction::ICmp:
1303 case Instruction::FCmp:
1304 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1305 Ops[1]);
1306 case Instruction::GetElementPtr: {
1307 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1308 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1309 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1310 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1311 }
1314 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1315 Ops[0], Ops[1],
1316 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1317 // An extract of a live-in is an extract of a broadcast, so return the
1318 // broadcasted element.
1319 case Instruction::ExtractElement:
1320 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1321 return Ops[0];
1322 }
1323 return nullptr;
1324 };
1325
1326 if (Value *V = FoldToIRValue())
1327 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1328 return nullptr;
1329}
1330
1331/// Try to simplify VPSingleDefRecipe \p Def.
1333 VPlan *Plan = Def->getParent()->getPlan();
1334
1335 // Simplification of live-in IR values for SingleDef recipes using
1336 // InstSimplifyFolder.
1337 const DataLayout &DL = Plan->getDataLayout();
1338 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1339 return Def->replaceAllUsesWith(V);
1340
1341 // Fold PredPHI LiveIn -> LiveIn.
1342 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1343 VPValue *Op = PredPHI->getOperand(0);
1344 if (isa<VPIRValue>(Op))
1345 PredPHI->replaceAllUsesWith(Op);
1346 }
1347
1348 VPBuilder Builder(Def);
1349
1350 // Avoid replacing VPInstructions that have underlying values with new
1351 // VPInstructions: we would fail to create widen/replicate recipes from the
1352 // new VPInstructions (which have no underlying value), and would miss out
1353 // on some transformations that only apply to widened/replicated recipes
1354 // later.
1355 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1356 // VPInstructions without underlying values, as those will get skipped during
1357 // cost computation.
1358 bool CanCreateNewRecipe =
1359 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1360
1361 VPValue *A;
1362 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1363 Type *TruncTy = TypeInfo.inferScalarType(Def);
1364 Type *ATy = TypeInfo.inferScalarType(A);
1365 if (TruncTy == ATy) {
1366 Def->replaceAllUsesWith(A);
1367 } else {
1368 // Don't replace a non-widened cast recipe with a widened cast.
1369 if (!isa<VPWidenCastRecipe>(Def))
1370 return;
1371 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1372
1373 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1374 ? Instruction::SExt
1375 : Instruction::ZExt;
1376 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1377 TruncTy);
1378 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1379 // UnderlyingExt has a distinct return type, used to retain the legacy cost.
1380 Ext->setUnderlyingValue(UnderlyingExt);
1381 }
1382 Def->replaceAllUsesWith(Ext);
1383 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1384 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1385 Def->replaceAllUsesWith(Trunc);
1386 }
1387 }
1388#ifndef NDEBUG
1389 // Verify that the cached type info for both A and its users is still
1390 // accurate by comparing it to freshly computed types.
1391 VPTypeAnalysis TypeInfo2(*Plan);
1392 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1393 for (VPUser *U : A->users()) {
1394 auto *R = cast<VPRecipeBase>(U);
1395 for (VPValue *VPV : R->definedValues())
1396 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1397 }
1398#endif
1399 }
1400
1401 // Simplify (X && Y) | (X && !Y) -> X.
1402 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1403 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1404 // recipes to be visited during simplification.
1405 VPValue *X, *Y, *Z;
1406 if (match(Def,
1409 Def->replaceAllUsesWith(X);
1410 Def->eraseFromParent();
1411 return;
1412 }
1413
1414 // x | AllOnes -> AllOnes
1415 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1416 return Def->replaceAllUsesWith(
1417 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1418
1419 // x | 0 -> x
1420 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1421 return Def->replaceAllUsesWith(X);
1422
1423 // x | !x -> AllOnes
1425 return Def->replaceAllUsesWith(
1426 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1427
1428 // x & 0 -> 0
1429 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1430 return Def->replaceAllUsesWith(
1431 Plan->getZero(TypeInfo.inferScalarType(Def)));
1432
1433 // x & AllOnes -> x
1434 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1435 return Def->replaceAllUsesWith(X);
1436
1437 // x && false -> false
1438 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1439 return Def->replaceAllUsesWith(Plan->getFalse());
1440
1441 // x && true -> x
1442 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1443 return Def->replaceAllUsesWith(X);
1444
1445 // (x && y) | (x && z) -> x && (y | z)
1446 if (CanCreateNewRecipe &&
1449 // Simplify only if one of the operands has one use to avoid creating an
1450 // extra recipe.
1451 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1452 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1453 return Def->replaceAllUsesWith(
1454 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1455
1456 // x && (x && y) -> x && y
1457 if (match(Def, m_LogicalAnd(m_VPValue(X),
1459 return Def->replaceAllUsesWith(Def->getOperand(1));
1460
1461 // x && (y && x) -> x && y
1462 if (match(Def, m_LogicalAnd(m_VPValue(X),
1464 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1465
1466 // x && !x -> 0
1468 return Def->replaceAllUsesWith(Plan->getFalse());
1469
1470 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1471 return Def->replaceAllUsesWith(X);
1472
1473 // select c, false, true -> not c
1474 VPValue *C;
1475 if (CanCreateNewRecipe &&
1476 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1477 return Def->replaceAllUsesWith(Builder.createNot(C));
1478
1479 // select !c, x, y -> select c, y, x
1480 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1481 Def->setOperand(0, C);
1482 Def->setOperand(1, Y);
1483 Def->setOperand(2, X);
1484 return;
1485 }
1486
1487 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1488 return Def->replaceAllUsesWith(A);
1489
1490 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1491 return Def->replaceAllUsesWith(A);
1492
1493 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1494 return Def->replaceAllUsesWith(
1495 Plan->getZero(TypeInfo.inferScalarType(Def)));
1496
1497 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1498 // Preserve nsw from the Mul on the new Sub.
1500 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1501 return Def->replaceAllUsesWith(
1502 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1503 Def->getDebugLoc(), "", NW));
1504 }
1505
1506 if (CanCreateNewRecipe &&
1508 // Preserve nsw from the Add and the Sub, if it's present on both, on the
1509 // new Sub.
1511 false,
1512 cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap() &&
1513 cast<VPRecipeWithIRFlags>(Def->getOperand(Def->getOperand(0) == X))
1514 ->hasNoSignedWrap()};
1515 return Def->replaceAllUsesWith(
1516 Builder.createSub(X, Y, Def->getDebugLoc(), "", NW));
1517 }
1518
1519 const APInt *APC;
1520 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1521 APC->isPowerOf2())
1522 return Def->replaceAllUsesWith(Builder.createNaryOp(
1523 Instruction::Shl,
1524 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1525 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1526
1527 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1528 APC->isPowerOf2())
1529 return Def->replaceAllUsesWith(Builder.createNaryOp(
1530 Instruction::LShr,
1531 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1532 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1533
1534 if (match(Def, m_Not(m_VPValue(A)))) {
1535 if (match(A, m_Not(m_VPValue(A))))
1536 return Def->replaceAllUsesWith(A);
1537
1538 // Try to fold Not into compares by adjusting the predicate in-place.
1539 CmpPredicate Pred;
1540 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1541 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1542 if (all_of(Cmp->users(),
1544 m_Not(m_Specific(Cmp)),
1545 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1546 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1547 for (VPUser *U : to_vector(Cmp->users())) {
1548 auto *R = cast<VPSingleDefRecipe>(U);
1549 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1550 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1551 R->setOperand(1, Y);
1552 R->setOperand(2, X);
1553 } else {
1554 // not (cmp pred) -> cmp inv_pred
1555 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1556 R->replaceAllUsesWith(Cmp);
1557 }
1558 }
1559 // If Cmp doesn't have a debug location, use the one from the negation,
1560 // to preserve the location.
1561 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1562 Cmp->setDebugLoc(Def->getDebugLoc());
1563 }
1564 }
1565 }
1566
1567 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1568 // any-of (fcmp uno %A, %B), ...
1569 if (match(Def, m_AnyOf())) {
1571 VPRecipeBase *UnpairedCmp = nullptr;
1572 for (VPValue *Op : Def->operands()) {
1573 VPValue *X;
1574 if (Op->getNumUsers() > 1 ||
1576 m_Deferred(X)))) {
1577 NewOps.push_back(Op);
1578 } else if (!UnpairedCmp) {
1579 UnpairedCmp = Op->getDefiningRecipe();
1580 } else {
1581 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1582 UnpairedCmp->getOperand(0), X));
1583 UnpairedCmp = nullptr;
1584 }
1585 }
1586
1587 if (UnpairedCmp)
1588 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1589
1590 if (NewOps.size() < Def->getNumOperands()) {
1591 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1592 return Def->replaceAllUsesWith(NewAnyOf);
1593 }
1594 }
1595
1596 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1597 // This is useful for fmax/fmin without fast-math flags, where we need to
1598 // check if any operand is NaN.
1599 if (CanCreateNewRecipe &&
1601 m_Deferred(X)),
1603 m_Deferred(Y))))) {
1604 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1605 return Def->replaceAllUsesWith(NewCmp);
1606 }
1607
1608   // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1609 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1610 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1611 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1612 TypeInfo.inferScalarType(Def))
1613 return Def->replaceAllUsesWith(Def->getOperand(1));
1614
1616 m_One()))) {
1617 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1618 if (TypeInfo.inferScalarType(X) != WideStepTy)
1619 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1620 Def->replaceAllUsesWith(X);
1621 return;
1622 }
1623
1624 // For i1 vp.merges produced by AnyOf reductions:
1625 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1627 m_VPValue(X), m_VPValue())) &&
1629 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1630 Def->setOperand(1, Def->getOperand(0));
1631 Def->setOperand(0, Y);
1632 return;
1633 }
1634
1635 // Simplify MaskedCond with no block mask to its single operand.
1637 !cast<VPInstruction>(Def)->isMasked())
1638 return Def->replaceAllUsesWith(Def->getOperand(0));
1639
1640 // Look through ExtractLastLane.
1641 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1642 if (match(A, m_BuildVector())) {
1643 auto *BuildVector = cast<VPInstruction>(A);
1644 Def->replaceAllUsesWith(
1645 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1646 return;
1647 }
1648 if (Plan->hasScalarVFOnly())
1649 return Def->replaceAllUsesWith(A);
1650 }
1651
1652   // Look through ExtractPenultimateElement (BuildVector ...).
1654 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1655 Def->replaceAllUsesWith(
1656 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1657 return;
1658 }
1659
1660 uint64_t Idx;
1662 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1663 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1664 return;
1665 }
1666
1667 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1668 Def->replaceAllUsesWith(
1669 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1670 return;
1671 }
1672
1673 // Look through broadcast of single-scalar when used as select conditions; in
1674 // that case the scalar condition can be used directly.
1675 if (match(Def,
1678 "broadcast operand must be single-scalar");
1679 Def->setOperand(0, C);
1680 return;
1681 }
1682
1684 if (Def->getNumOperands() == 1) {
1685 Def->replaceAllUsesWith(Def->getOperand(0));
1686 return;
1687 }
1688 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1689 if (all_equal(Phi->incoming_values()))
1690 Phi->replaceAllUsesWith(Phi->getOperand(0));
1691 }
1692 return;
1693 }
1694
1695 VPIRValue *IRV;
1696 if (Def->getNumOperands() == 1 &&
1698 return Def->replaceAllUsesWith(IRV);
1699
1700 // Some simplifications can only be applied after unrolling. Perform them
1701 // below.
1702 if (!Plan->isUnrolled())
1703 return;
1704
1705 // After unrolling, extract-lane may be used to extract values from multiple
1706 // scalar sources. Only simplify when extracting from a single scalar source.
1707 VPValue *LaneToExtract;
1708 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1709 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1711 return Def->replaceAllUsesWith(A);
1712
1713 // Simplify extract-lane with single source to extract-element.
1714 Def->replaceAllUsesWith(Builder.createNaryOp(
1715 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1716 return;
1717 }
1718
1719 // Look for cycles where Def is of the form:
1720 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1721 // IVInc = X + Step ; used by X and Def
1722 // Def = IVInc + Y
1723 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1724 // and if Inc exists, replace it with X.
1725 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1726 isa<VPIRValue>(Y) &&
1727 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1728 auto *Phi = cast<VPPhi>(X);
1729 auto *IVInc = Def->getOperand(0);
1730 if (IVInc->getNumUsers() == 2) {
1731 // If Phi has a second user (besides IVInc's defining recipe), it must
1732 // be Inc = Phi + Y for the fold to apply.
1735 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1736 Def->replaceAllUsesWith(IVInc);
1737 if (Inc)
1738 Inc->replaceAllUsesWith(Phi);
1739 Phi->setOperand(0, Y);
1740 return;
1741 }
1742 }
1743 }
1744
1745 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1746 // just the pointer operand.
1747 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1748 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1749 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1750
1751 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1752   // the start index is zero and only the first lane is demanded.
1753 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1754 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1755 Steps->replaceAllUsesWith(Steps->getOperand(0));
1756 return;
1757 }
1758 }
1759 // Simplify redundant ReductionStartVector recipes after unrolling.
1760 VPValue *StartV;
1762 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1763 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1764 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1765 return PhiR && PhiR->isInLoop();
1766 });
1767 return;
1768 }
1769
1771 Def->replaceAllUsesWith(A);
1772 return;
1773 }
1774
1775 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1776 vputils::isSingleScalar(A) && all_of(A->users(), [Def, A](VPUser *U) {
1777 return U->usesScalars(A) || Def == U;
1778 })) {
1779 return Def->replaceAllUsesWith(A);
1780 }
1781
1782 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1783 return Def->replaceAllUsesWith(A);
1784}
1785
1788 Plan.getEntry());
1789 VPTypeAnalysis TypeInfo(Plan);
1791 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1792 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1793 simplifyRecipe(Def, TypeInfo);
1794 }
1795}
1796
1797/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1798/// header mask to be simplified further when tail folding, e.g. in
1799/// optimizeEVLMasks.
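/// For example (schematic VPlan operands, with %hm the header mask):
///   %a = logical-and %hm, %x
///   %b = logical-and %a, %y
/// is reassociated to
///   %b = logical-and %hm, (logical-and %x, %y)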
1800static void reassociateHeaderMask(VPlan &Plan) {
1801 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1802 if (!HeaderMask)
1803 return;
1804
1805 SmallVector<VPUser *> Worklist;
1806 for (VPUser *U : HeaderMask->users())
1807 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1809
1810 while (!Worklist.empty()) {
1811 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1812 VPValue *X, *Y;
1813 if (!R || !match(R, m_LogicalAnd(
1814 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1815 m_VPValue(Y))))
1816 continue;
1817 append_range(Worklist, R->users());
1818 VPBuilder Builder(R);
1819 R->replaceAllUsesWith(
1820 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1821 }
1822}
1823
1825 if (Plan.hasScalarVFOnly())
1826 return;
1827
1829 vp_depth_first_deep(Plan.getEntry()))) {
1830 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1832 continue;
1833 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1834 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1835 continue;
1836
1837 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
1838 if (RepR && RepR->getOpcode() == Instruction::Store &&
1839 vputils::isSingleScalar(RepR->getOperand(1))) {
1840 auto *Clone = new VPReplicateRecipe(
1841 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1842 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1843 *RepR /*Metadata*/, RepR->getDebugLoc());
1844 Clone->insertBefore(RepOrWidenR);
1845 VPBuilder Builder(Clone);
1846 VPValue *ExtractOp = Clone->getOperand(0);
1847 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1848 ExtractOp =
1849 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1850 ExtractOp =
1851 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1852 Clone->setOperand(0, ExtractOp);
1853 RepR->eraseFromParent();
1854 continue;
1855 }
1856
1857 // Skip recipes that aren't single scalars.
1858 if (!vputils::isSingleScalar(RepOrWidenR))
1859 continue;
1860
1861 // Predicate to check if a user of Op introduces extra broadcasts.
1862 auto IntroducesBCastOf = [](const VPValue *Op) {
1863 return [Op](const VPUser *U) {
1864 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1868 VPI->getOpcode()))
1869 return false;
1870 }
1871 return !U->usesScalars(Op);
1872 };
1873 };
1874
1875 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1876 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1877 if (any_of(
1878 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1879 IntroducesBCastOf(Op)))
1880 return false;
1881 // Non-constant live-ins require broadcasts, while constants do not
1882 // need explicit broadcasts.
1883 auto *IRV = dyn_cast<VPIRValue>(Op);
1884 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1885 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1886 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1887 }))
1888 continue;
1889
1890 auto *Clone = new VPReplicateRecipe(
1891 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1892 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1893 Clone->insertBefore(RepOrWidenR);
1894 RepOrWidenR->replaceAllUsesWith(Clone);
1895 if (isDeadRecipe(*RepOrWidenR))
1896 RepOrWidenR->eraseFromParent();
1897 }
1898 }
1899}
1900
1901/// Try to see if all of \p Blend's masks share a common value logically and'ed
1902/// and remove it from the masks.
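/// For example (schematically), a blend whose masks are
///   (logical-and %common, %m0), (logical-and %common, %m1), ...
/// is rewritten to use %m0, %m1, ... directly as its masks.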
1904 if (Blend->isNormalized())
1905 return;
1906 VPValue *CommonEdgeMask;
1907 if (!match(Blend->getMask(0),
1908 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1909 return;
1910 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1911 if (!match(Blend->getMask(I),
1912 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1913 return;
1914 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1915 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1916}
1917
1918/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1919/// to make sure the masks are simplified.
1920static void simplifyBlends(VPlan &Plan) {
1923 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1924 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1925 if (!Blend)
1926 continue;
1927
1928 removeCommonBlendMask(Blend);
1929
1930 // Try to remove redundant blend recipes.
1931 SmallPtrSet<VPValue *, 4> UniqueValues;
1932 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1933 UniqueValues.insert(Blend->getIncomingValue(0));
1934 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1935 if (!match(Blend->getMask(I), m_False()))
1936 UniqueValues.insert(Blend->getIncomingValue(I));
1937
1938 if (UniqueValues.size() == 1) {
1939 Blend->replaceAllUsesWith(*UniqueValues.begin());
1940 Blend->eraseFromParent();
1941 continue;
1942 }
1943
1944 if (Blend->isNormalized())
1945 continue;
1946
1947 // Normalize the blend so its first incoming value is used as the initial
1948 // value with the others blended into it.
1949
1950 unsigned StartIndex = 0;
1951 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1952       // If a value's mask is used only by the blend then it can be deadcoded.
1953 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1954 // that's used by multiple blends where it can be removed from them all.
1955 VPValue *Mask = Blend->getMask(I);
1956 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1957 StartIndex = I;
1958 break;
1959 }
1960 }
1961
1962 SmallVector<VPValue *, 4> OperandsWithMask;
1963 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1964
1965 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1966 if (I == StartIndex)
1967 continue;
1968 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1969 OperandsWithMask.push_back(Blend->getMask(I));
1970 }
1971
1972 auto *NewBlend =
1973 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1974 OperandsWithMask, *Blend, Blend->getDebugLoc());
1975 NewBlend->insertBefore(&R);
1976
1977 VPValue *DeadMask = Blend->getMask(StartIndex);
1978 Blend->replaceAllUsesWith(NewBlend);
1979 Blend->eraseFromParent();
1981
1982 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1983 VPValue *NewMask;
1984 if (NewBlend->getNumOperands() == 3 &&
1985 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1986 VPValue *Inc0 = NewBlend->getOperand(0);
1987 VPValue *Inc1 = NewBlend->getOperand(1);
1988 VPValue *OldMask = NewBlend->getOperand(2);
1989 NewBlend->setOperand(0, Inc1);
1990 NewBlend->setOperand(1, Inc0);
1991 NewBlend->setOperand(2, NewMask);
1992 if (OldMask->getNumUsers() == 0)
1993 cast<VPInstruction>(OldMask)->eraseFromParent();
1994 }
1995 }
1996 }
1997}
1998
1999/// Optimize the width of vector induction variables in \p Plan based on a known
2000/// constant Trip Count, \p BestVF and \p BestUF.
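/// For example (illustrative numbers): with a trip count of 100, VF = 4 and
/// UF = 2, the trip count aligned up to VF * UF = 8 is 104, so the maximum IV
/// value 103 fits in 7 bits and the induction can be narrowed to the minimum
/// supported width of i8.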
2002 ElementCount BestVF,
2003 unsigned BestUF) {
2004 // Only proceed if we have not completely removed the vector region.
2005 if (!Plan.getVectorLoopRegion())
2006 return false;
2007
2008 const APInt *TC;
2009 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2010 return false;
2011
2012 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2013 // and UF. Returns at least 8.
2014 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2015 APInt AlignedTC =
2018 APInt MaxVal = AlignedTC - 1;
2019 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2020 };
2021 unsigned NewBitWidth =
2022 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2023
2024 LLVMContext &Ctx = Plan.getContext();
2025 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2026
2027 bool MadeChange = false;
2028
2029 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2030 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2031 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2032
2033 // Currently only handle canonical IVs as it is trivial to replace the start
2034 // and stop values, and we currently only perform the optimization when the
2035 // IV has a single use.
2036 if (!WideIV || !WideIV->isCanonical() ||
2037 WideIV->hasMoreThanOneUniqueUser() ||
2038 NewIVTy == WideIV->getScalarType())
2039 continue;
2040
2041 // Currently only handle cases where the single user is a header-mask
2042 // comparison with the backedge-taken-count.
2043 VPUser *SingleUser = WideIV->getSingleUser();
2044 if (!SingleUser ||
2045 !match(SingleUser,
2046 m_ICmp(m_Specific(WideIV),
2048 continue;
2049
2050 // Update IV operands and comparison bound to use new narrower type.
2051 auto *NewStart = Plan.getZero(NewIVTy);
2052 WideIV->setStartValue(NewStart);
2053 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2054 WideIV->setStepValue(NewStep);
2055
2056 auto *NewBTC = new VPWidenCastRecipe(
2057 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2058 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2059 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2060 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2061 Cmp->setOperand(1, NewBTC);
2062
2063 MadeChange = true;
2064 }
2065
2066 return MadeChange;
2067}
2068
2069/// Return true if \p Cond is known to be true for given \p BestVF and \p
2070/// BestUF.
2072 ElementCount BestVF, unsigned BestUF,
2075 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2076 &PSE](VPValue *C) {
2077 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2078 });
2079
2080 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2083 m_c_Add(m_Specific(CanIV), m_Specific(&Plan.getVFxUF())),
2084 m_Specific(&Plan.getVectorTripCount()))))
2085 return false;
2086
2087 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2088 // count is not conveniently available as SCEV so far, so we compare directly
2089 // against the original trip count. This is stricter than necessary, as we
2090 // will only return true if the trip count == vector trip count.
2091 const SCEV *VectorTripCount =
2093 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2094 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2095 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2096 "Trip count SCEV must be computable");
2097 ScalarEvolution &SE = *PSE.getSE();
2098 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2099 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2100 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2101}
2102
2103/// Try to replace multiple active lane masks used for control flow with
2104/// a single, wide active lane mask instruction followed by multiple
2105/// extract subvector intrinsics. This applies to the active lane mask
2106/// instructions both in the loop and in the preheader.
2107/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2108 /// new extracts from the first active lane mask, which has its last
2109/// operand (multiplier) set to UF.
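/// For example, with UF = 2 (schematically):
///   %wide.alm = active-lane-mask %index, %tc, 2    ; 2 * VF lanes
///   %alm.0    = vector-extract %wide.alm, 0
///   %alm.1    = vector-extract %wide.alm, VF
/// and the two active-lane-mask phis take %alm.0 and %alm.1 as incoming values.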
2111 unsigned UF) {
2112 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2113 return false;
2114
2115 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2116 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2117 auto *Term = &ExitingVPBB->back();
2118
2119 using namespace llvm::VPlanPatternMatch;
2121 m_VPValue(), m_VPValue(), m_VPValue())))))
2122 return false;
2123
2124 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2125 LLVMContext &Ctx = Plan.getContext();
2126
2127 auto ExtractFromALM = [&](VPInstruction *ALM,
2128 SmallVectorImpl<VPValue *> &Extracts) {
2129 DebugLoc DL = ALM->getDebugLoc();
2130 for (unsigned Part = 0; Part < UF; ++Part) {
2132 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2133 auto *Ext =
2134 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2135 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2136 Extracts[Part] = Ext;
2137 Ext->insertAfter(ALM);
2138 }
2139 };
2140
2141 // Create a list of each active lane mask phi, ordered by unroll part.
2143 for (VPRecipeBase &R : Header->phis()) {
2145 if (!Phi)
2146 continue;
2147 VPValue *Index = nullptr;
2148 match(Phi->getBackedgeValue(),
2150 assert(Index && "Expected index from ActiveLaneMask instruction");
2151
2152 uint64_t Part;
2153 if (match(Index,
2155 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2156 Phis[Part] = Phi;
2157 else {
2158 // Anything other than a CanonicalIVIncrementForPart is part 0
2159 assert(!match(
2160 Index,
2162 Phis[0] = Phi;
2163 }
2164 }
2165
2166 assert(all_of(Phis, not_equal_to(nullptr)) &&
2167 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2168
2169 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2170 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2171
2172 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2173 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2174 "Expected incoming values of Phi to be ActiveLaneMasks");
2175
2176   // When using wide lane masks, the get.active.lane.mask intrinsic returns
2177   // VF x UF elements; its multiplier (the last operand) is set to UF.
2178 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2179 EntryALM->setOperand(2, ALMMultiplier);
2180 LoopALM->setOperand(2, ALMMultiplier);
2181
2182 // Create UF x extract vectors and insert into preheader.
2183 SmallVector<VPValue *> EntryExtracts(UF);
2184 ExtractFromALM(EntryALM, EntryExtracts);
2185
2186 // Create UF x extract vectors and insert before the loop compare & branch,
2187 // updating the compare to use the first extract.
2188 SmallVector<VPValue *> LoopExtracts(UF);
2189 ExtractFromALM(LoopALM, LoopExtracts);
2190 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2191 Not->setOperand(0, LoopExtracts[0]);
2192
2193 // Update the incoming values of active lane mask phis.
2194 for (unsigned Part = 0; Part < UF; ++Part) {
2195 Phis[Part]->setStartValue(EntryExtracts[Part]);
2196 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2197 }
2198
2199 return true;
2200}
2201
2202/// Try to simplify the branch condition of \p Plan. This may restrict the
2203/// resulting plan to \p BestVF and \p BestUF.
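/// For example, if the vector trip count is known to be <= VF * UF, the latch
/// terminator is replaced with an unconditional exit (branch-on-cond true), so
/// the vector loop region executes exactly once.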
2205 unsigned BestUF,
2207 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2208 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2209 auto *Term = &ExitingVPBB->back();
2210 VPValue *Cond;
2211 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2212 // Check if the branch condition compares the canonical IV increment (for main
2213 // loop), or the canonical IV increment plus an offset (for epilog loop).
2214 if (match(Term, m_BranchOnCount(
2215 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2216 m_VPValue())) ||
2218 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2219 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2220 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2221 const SCEV *VectorTripCount =
2223 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2224 VectorTripCount =
2226 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2227 "Trip count SCEV must be computable");
2228 ScalarEvolution &SE = *PSE.getSE();
2229 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2230 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2231 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2232 return false;
2233 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2235 // For BranchOnCond, check if we can prove the condition to be true using VF
2236 // and UF.
2237 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2238 return false;
2239 } else {
2240 return false;
2241 }
2242
2243 // The vector loop region only executes once. Convert terminator of the
2244 // exiting block to exit in the first iteration.
2245 if (match(Term, m_BranchOnTwoConds())) {
2246 Term->setOperand(1, Plan.getTrue());
2247 return true;
2248 }
2249
2250 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2251 {}, Term->getDebugLoc());
2252 ExitingVPBB->appendRecipe(BOC);
2253 Term->eraseFromParent();
2254
2255 return true;
2256}
2257
2258/// From the definition of llvm.experimental.get.vector.length,
2259/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
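/// So, when SCEV can prove AVL <= VF, the explicit-vector-length recipe can be
/// replaced by the AVL itself (zero-extended or truncated to i32 as needed).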
2263 vp_depth_first_deep(Plan.getEntry()))) {
2264 for (VPRecipeBase &R : *VPBB) {
2265 VPValue *AVL;
2266 if (!match(&R, m_EVL(m_VPValue(AVL))))
2267 continue;
2268
2269 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2270 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2271 continue;
2272 ScalarEvolution &SE = *PSE.getSE();
2273 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2274 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2275 continue;
2276
2278 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2279 R.getDebugLoc());
2280 if (Trunc != AVL) {
2281 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2282 const DataLayout &DL = Plan.getDataLayout();
2283 VPTypeAnalysis TypeInfo(Plan);
2284 if (VPValue *Folded =
2285 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2286 Trunc = Folded;
2287 }
2288 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2289 return true;
2290 }
2291 }
2292 return false;
2293}
2294
2296 unsigned BestUF,
2298 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2299 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2300
2301 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2302 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2303 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2304
2305 if (MadeChange) {
2306 Plan.setVF(BestVF);
2307 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2308 }
2309}
2310
2312 for (VPRecipeBase &R :
2314 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2315 if (!PhiR)
2316 continue;
2317 RecurKind RK = PhiR->getRecurrenceKind();
2318 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2320 continue;
2321
2322 for (VPUser *U : collectUsersRecursively(PhiR))
2323 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2324 RecWithFlags->dropPoisonGeneratingFlags();
2325 }
2326 }
2327}
2328
2329namespace {
2330struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2331 static bool isSentinel(const VPSingleDefRecipe *Def) {
2332 return Def == getEmptyKey() || Def == getTombstoneKey();
2333 }
2334
2335 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2336 /// return that source element type.
2337 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2338 // All VPInstructions that lower to GEPs must have the i8 source element
2339 // type (as they are PtrAdds), so we omit it.
2341 .Case([](const VPReplicateRecipe *I) -> Type * {
2342 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2343 return GEP->getSourceElementType();
2344 return nullptr;
2345 })
2346 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2347 [](auto *I) { return I->getSourceElementType(); })
2348 .Default([](auto *) { return nullptr; });
2349 }
2350
2351   /// Returns true if recipe \p Def can be safely handled for CSE.
2352 static bool canHandle(const VPSingleDefRecipe *Def) {
2353 // We can extend the list of handled recipes in the future,
2354 // provided we account for the data embedded in them while checking for
2355 // equality or hashing.
2356 auto C = getOpcodeOrIntrinsicID(Def);
2357
2358 // The issue with (Insert|Extract)Value is that the index of the
2359 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2360 // VPlan.
2361 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2362 C->second == Instruction::ExtractValue)))
2363 return false;
2364
2365 // During CSE, we can only handle recipes that don't read from memory: if
2366 // they read from memory, there could be an intervening write to memory
2367 // before the next instance is CSE'd, leading to an incorrect result.
2368 return !Def->mayReadFromMemory();
2369 }
2370
2371 /// Hash the underlying data of \p Def.
2372 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2373 const VPlan *Plan = Def->getParent()->getPlan();
2374 VPTypeAnalysis TypeInfo(*Plan);
2375 hash_code Result = hash_combine(
2376 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2377 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2379 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2380 if (RFlags->hasPredicate())
2381 return hash_combine(Result, RFlags->getPredicate());
2382 if (auto *SIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Def))
2383 return hash_combine(Result, SIVSteps->getInductionOpcode());
2384 return Result;
2385 }
2386
2387 /// Check equality of underlying data of \p L and \p R.
2388 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2389 if (isSentinel(L) || isSentinel(R))
2390 return L == R;
2391 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2393 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2395 !equal(L->operands(), R->operands()))
2396 return false;
2398 "must have valid opcode info for both recipes");
2399 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2400 if (LFlags->hasPredicate() &&
2401 LFlags->getPredicate() !=
2402 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2403 return false;
2404 if (auto *LSIV = dyn_cast<VPScalarIVStepsRecipe>(L))
2405 if (LSIV->getInductionOpcode() !=
2406 cast<VPScalarIVStepsRecipe>(R)->getInductionOpcode())
2407 return false;
2408 // Recipes in replicate regions implicitly depend on predicate. If either
2409 // recipe is in a replicate region, only consider them equal if both have
2410 // the same parent.
2411 const VPRegionBlock *RegionL = L->getRegion();
2412 const VPRegionBlock *RegionR = R->getRegion();
2413 if (((RegionL && RegionL->isReplicator()) ||
2414 (RegionR && RegionR->isReplicator())) &&
2415 L->getParent() != R->getParent())
2416 return false;
2417 const VPlan *Plan = L->getParent()->getPlan();
2418 VPTypeAnalysis TypeInfo(*Plan);
2419 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2420 }
2421};
2422} // end anonymous namespace
2423
2424/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2425/// Plan.
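/// For example, two widen GEP recipes with identical opcodes, operands, types
/// and source element types are merged: the later one is replaced by the
/// earlier one, provided the earlier recipe dominates it; flags present on only
/// one of them are dropped.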
2427 VPDominatorTree VPDT(Plan);
2429
2431 Plan.getEntry());
2433 for (VPRecipeBase &R : *VPBB) {
2434 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2435 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2436 continue;
2437 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2438 // V must dominate Def for a valid replacement.
2439 if (!VPDT.dominates(V->getParent(), VPBB))
2440 continue;
2441 // Only keep flags present on both V and Def.
2442 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2443 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2444 Def->replaceAllUsesWith(V);
2445 continue;
2446 }
2447 CSEMap[Def] = Def;
2448 }
2449 }
2450}
2451
2452/// Return true if we do not know how to (mechanically) hoist or sink a
2453/// non-memory or memory recipe \p R out of a loop region.
2455 VPBasicBlock *LastBB) {
2456 if (!isa<VPReplicateRecipe>(R) || !R.mayReadFromMemory())
2458
2459 // Check that the load doesn't alias with stores between FirstBB and LastBB.
2460 auto MemLoc = vputils::getMemoryLocation(R);
2461 return !MemLoc || !canHoistOrSinkWithNoAliasCheck(*MemLoc, FirstBB, LastBB);
2462}
2463
2464/// Move loop-invariant recipes out of the vector loop region in \p Plan.
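/// For example, a recipe whose operands are all defined outside any loop
/// region, and which is otherwise safe to hoist (e.g. does not read memory
/// that may be clobbered in the loop), is moved to the vector preheader.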
2465static void licm(VPlan &Plan) {
2466 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2467
2468 // Hoist any loop invariant recipes from the vector loop region to the
2469   // preheader. Perform a shallow traversal of the vector loop region, to
2470 // exclude recipes in replicate regions. Since the top-level blocks in the
2471 // vector loop region are guaranteed to execute if the vector pre-header is,
2472 // we don't need to check speculation safety.
2473 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2474 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2475          "Expected vector preheader's successor to be the vector loop region");
2477 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2478 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2479 if (cannotHoistOrSinkRecipe(R, LoopRegion->getEntryBasicBlock(),
2480 LoopRegion->getExitingBasicBlock()))
2481 continue;
2482 if (any_of(R.operands(), [](VPValue *Op) {
2483 return !Op->isDefinedOutsideLoopRegions();
2484 }))
2485 continue;
2486 R.moveBefore(*Preheader, Preheader->end());
2487 }
2488 }
2489
2490#ifndef NDEBUG
2491 VPDominatorTree VPDT(Plan);
2492#endif
2493   // Sink recipes that have no users inside the vector loop region, if all
2494   // their users are in the same exit block of the region.
2495 // TODO: Extend to sink recipes from inner loops.
2497 LoopRegion->getEntry());
2499 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2500 if (vputils::cannotHoistOrSinkRecipe(R, /*Sinking=*/true))
2501 continue;
2502
2503 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2504 assert(!RepR->isPredicated() &&
2505 "Expected prior transformation of predicated replicates to "
2506 "replicate regions");
2507 // narrowToSingleScalarRecipes should have already maximally narrowed
2508 // replicates to single-scalar replicates.
2509 // TODO: When unrolling, replicateByVF doesn't handle sunk
2510 // non-single-scalar replicates correctly.
2511 if (!RepR->isSingleScalar())
2512 continue;
2513 }
2514
2515 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2516 // support recipes with multiple defined values (e.g., interleaved loads).
2517 auto *Def = cast<VPSingleDefRecipe>(&R);
2518
2519       // Cannot sink the recipe if the user is defined in a loop region or a
2520       // non-successor of the vector loop region. Cannot sink if the user is a
2521       // phi either.
2522 VPBasicBlock *SinkBB = nullptr;
2523 if (any_of(Def->users(), [&SinkBB, &LoopRegion](VPUser *U) {
2524 auto *UserR = cast<VPRecipeBase>(U);
2525 VPBasicBlock *Parent = UserR->getParent();
2526 // TODO: Support sinking when users are in multiple blocks.
2527 if (SinkBB && SinkBB != Parent)
2528 return true;
2529 SinkBB = Parent;
2530 // TODO: If the user is a PHI node, we should check the block of
2531 // incoming value. Support PHI node users if needed.
2532 return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
2533 Parent->getSinglePredecessor() != LoopRegion;
2534 }))
2535 continue;
2536
2537 if (!SinkBB)
2538 SinkBB = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
2539
2540       // TODO: This will need to be a check instead of an assert after
2541 // conditional branches in vectorized loops are supported.
2542 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2543 "Defining block must dominate sink block");
2544 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2545 // just moving.
2546 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2547 }
2548 }
2549}
2550
2552 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2553 if (Plan.hasScalarVFOnly())
2554 return;
2555 // Keep track of created truncates, so they can be re-used. Note that we
2556   // cannot use RAUW after creating a new truncate, as this could make
2557 // other uses have different types for their operands, making them invalidly
2558 // typed.
2560 VPTypeAnalysis TypeInfo(Plan);
2561 VPBasicBlock *PH = Plan.getVectorPreheader();
2564 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2567 continue;
2568
2569 VPValue *ResultVPV = R.getVPSingleValue();
2570 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2571 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2572 if (!NewResSizeInBits)
2573 continue;
2574
2575 // If the value wasn't vectorized, we must maintain the original scalar
2576 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2577 // skip casts which do not need to be handled explicitly here, as
2578 // redundant casts will be removed during recipe simplification.
2580 continue;
2581
2582 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2583 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2584 assert(OldResTy->isIntegerTy() && "only integer types supported");
2585 (void)OldResSizeInBits;
2586
2587 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2588
2589 // Any wrapping introduced by shrinking this operation shouldn't be
2590 // considered undefined behavior. So, we can't unconditionally copy
2591 // arithmetic wrapping flags to VPW.
2592 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2593 VPW->dropPoisonGeneratingFlags();
2594
2595 if (OldResSizeInBits != NewResSizeInBits &&
2596 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2597 // Extend result to original width.
2598 auto *Ext = new VPWidenCastRecipe(
2599 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2600 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2601 Ext->insertAfter(&R);
2602 ResultVPV->replaceAllUsesWith(Ext);
2603 Ext->setOperand(0, ResultVPV);
2604 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2605 } else {
2606 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2607 "Only ICmps should not need extending the result.");
2608 }
2609
2610 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2612 continue;
2613
2614 // Shrink operands by introducing truncates as needed.
2615 unsigned StartIdx =
2616 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2617 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2618 auto *Op = R.getOperand(Idx);
2619 unsigned OpSizeInBits =
2621 if (OpSizeInBits == NewResSizeInBits)
2622 continue;
2623 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2624 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2625 if (!IterIsEmpty) {
2626 R.setOperand(Idx, ProcessedIter->second);
2627 continue;
2628 }
2629
2630 VPBuilder Builder;
2631 if (isa<VPIRValue>(Op))
2632 Builder.setInsertPoint(PH);
2633 else
2634 Builder.setInsertPoint(&R);
2635 VPWidenCastRecipe *NewOp =
2636 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2637 ProcessedIter->second = NewOp;
2638 R.setOperand(Idx, NewOp);
2639 }
2640
2641 }
2642 }
2643}
2644
2645void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2646 std::optional<VPDominatorTree> VPDT;
2647 if (OnlyLatches)
2648 VPDT.emplace(Plan);
2649
2650 // Collect all blocks before modifying the CFG so we can identify unreachable
2651 // ones after constant branch removal.
2653
2654 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2655 VPValue *Cond;
2656 // Skip blocks that are not terminated by BranchOnCond.
2657 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2658 continue;
2659
2660 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2661 continue;
2662
2663 assert(VPBB->getNumSuccessors() == 2 &&
2664 "Two successors expected for BranchOnCond");
2665 unsigned RemovedIdx;
2666 if (match(Cond, m_True()))
2667 RemovedIdx = 1;
2668 else if (match(Cond, m_False()))
2669 RemovedIdx = 0;
2670 else
2671 continue;
2672
2673 VPBasicBlock *RemovedSucc =
2674 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2675 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2676 "There must be a single edge between VPBB and its successor");
2677 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2678 // these recipes.
2679 for (VPRecipeBase &R : RemovedSucc->phis())
2680 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2681
2682 // Disconnect blocks and remove the terminator.
2683 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2684 VPBB->back().eraseFromParent();
2685 }
2686
2687 // Compute which blocks are still reachable from the entry after constant
2688 // branch removal.
2691
2692 // Detach all unreachable blocks from their successors, removing their recipes
2693 // and incoming values from phi recipes.
2694 VPSymbolicValue Tmp(nullptr);
2695 for (VPBlockBase *B : AllBlocks) {
2696 if (Reachable.contains(B))
2697 continue;
2698 for (VPBlockBase *Succ : to_vector(B->successors())) {
2699 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2700 for (VPRecipeBase &R : SuccBB->phis())
2701 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2703 }
2704 for (VPBasicBlock *DeadBB :
2706 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2707 for (VPValue *Def : R.definedValues())
2708 Def->replaceAllUsesWith(&Tmp);
2709 R.eraseFromParent();
2710 }
2711 }
2712 }
2713}
2714
2734
2735// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2736// the loop terminator with a branch-on-cond recipe with the negated
2737// active-lane-mask as operand. Note that this turns the loop into an
2738 // uncountable one. Only the existing terminator is replaced; all other existing
2739// recipes/users remain unchanged, except for poison-generating flags being
2740// dropped from the canonical IV increment. Return the created
2741// VPActiveLaneMaskPHIRecipe.
2742//
2743// The function adds the following recipes:
2744//
2745// vector.ph:
2746// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2747// %EntryALM = active-lane-mask %EntryInc, TC
2748//
2749// vector.body:
2750// ...
2751// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2752// ...
2753// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2754// %ALM = active-lane-mask %InLoopInc, TC
2755// %Negated = Not %ALM
2756// branch-on-cond %Negated
2757//
2760 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2761 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2762 VPValue *StartV = Plan.getZero(TopRegion->getCanonicalIVType());
2763 auto *CanonicalIVIncrement = TopRegion->getOrCreateCanonicalIVIncrement();
2764 // TODO: Check if dropping the flags is needed.
2765 TopRegion->clearCanonicalIVNUW(CanonicalIVIncrement);
2766 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2767 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2768 // we have to take unrolling into account. Each part needs to start at
2769 // Part * VF
2770 auto *VecPreheader = Plan.getVectorPreheader();
2771 VPBuilder Builder(VecPreheader);
2772
2773 // Create the ActiveLaneMask instruction using the correct start values.
2774 VPValue *TC = Plan.getTripCount();
2775 VPValue *VF = &Plan.getVF();
2776
2777 auto *EntryIncrement = Builder.createOverflowingOp(
2778 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2779 DL, "index.part.next");
2780
2781 // Create the active lane mask instruction in the VPlan preheader.
2782 VPValue *ALMMultiplier =
2783 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2784 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2785 {EntryIncrement, TC, ALMMultiplier}, DL,
2786 "active.lane.mask.entry");
2787
2788 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2789 // preheader ActiveLaneMask instruction.
2790 auto *LaneMaskPhi =
2792 auto *HeaderVPBB = TopRegion->getEntryBasicBlock();
2793 LaneMaskPhi->insertBefore(*HeaderVPBB, HeaderVPBB->begin());
2794
2795 // Create the active lane mask for the next iteration of the loop before the
2796 // original terminator.
2797 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2798 Builder.setInsertPoint(OriginalTerminator);
2799 auto *InLoopIncrement = Builder.createOverflowingOp(
2801 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
2802 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2803 {InLoopIncrement, TC, ALMMultiplier}, DL,
2804 "active.lane.mask.next");
2805 LaneMaskPhi->addOperand(ALM);
2806
2807 // Replace the original terminator with BranchOnCond. We have to invert the
2808 // mask here because a true condition means jumping to the exit block.
2809 auto *NotMask = Builder.createNot(ALM, DL);
2810 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2811 OriginalTerminator->eraseFromParent();
2812 return LaneMaskPhi;
2813}
2814
2816 bool UseActiveLaneMaskForControlFlow) {
2817 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2818 auto *WideCanonicalIV = vputils::findUserOf<VPWidenCanonicalIVRecipe>(
2819 LoopRegion->getCanonicalIV());
2820 assert(WideCanonicalIV &&
2821 "Must have widened canonical IV when tail folding!");
2822 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
2823 VPSingleDefRecipe *LaneMask;
2824 if (UseActiveLaneMaskForControlFlow) {
2825 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
2826 } else {
2827 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2828 VPValue *ALMMultiplier =
2829 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
2830 LaneMask =
2831 B.createNaryOp(VPInstruction::ActiveLaneMask,
2832 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2833 nullptr, "active.lane.mask");
2834 }
2835
2836 // Walk users of WideCanonicalIV and replace the header mask of the form
2837 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2838 // removing the old one to ensure there is always only a single header mask.
2839 HeaderMask->replaceAllUsesWith(LaneMask);
2840 HeaderMask->eraseFromParent();
2841}
2842
2843template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2844 Op0_t In;
2846
2847 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2848
2849 template <typename OpTy> bool match(OpTy *V) const {
2850 if (m_Specific(In).match(V)) {
2851 Out = nullptr;
2852 return true;
2853 }
2854 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2855 }
2856};
2857
2858/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2859/// Returns the remaining part \p Out if so, or nullptr otherwise.
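/// For example, m_RemoveMask(%hm, Out) matches %hm itself (setting \p Out to
/// nullptr) as well as (logical-and %hm, %m) (setting \p Out to %m).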
2860template <typename Op0_t, typename Op1_t>
2861static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2862 Op1_t &Out) {
2863 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2864}
2865
2866/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2867/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2868/// recipe could be created.
2869/// \p HeaderMask Header Mask.
2870 /// \p CurRecipe Recipe to be transformed.
2871/// \p TypeInfo VPlan-based type analysis.
2872/// \p EVL The explicit vector length parameter of vector-predication
2873/// intrinsics.
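/// For example (schematically), a widened load masked by the header mask
///   WIDEN load %addr, mask (logical-and %header.mask, %m)
/// becomes an EVL-based load
///   WIDEN vp.load %addr, %evl, mask %m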
2875 VPRecipeBase &CurRecipe,
2876 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2877 VPlan *Plan = CurRecipe.getParent()->getPlan();
2878 DebugLoc DL = CurRecipe.getDebugLoc();
2879 VPValue *Addr, *Mask, *EndPtr;
2880
2881 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2882 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2883 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2884 EVLEndPtr->insertBefore(&CurRecipe);
2885 EVLEndPtr->setOperand(1, &EVL);
2886 return EVLEndPtr;
2887 };
2888
2889 auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
2891 if (!V)
2892 return nullptr;
2893 auto *Reverse = new VPWidenIntrinsicRecipe(
2894 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
2895 TypeInfo.inferScalarType(V), {}, {}, DL);
2896 Reverse->insertBefore(&CurRecipe);
2897 return Reverse;
2898 };
2899
2900 if (match(&CurRecipe,
2901 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
2902 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2903 EVL, Mask);
2904
2905 VPValue *ReversedVal;
2906 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2907 match(ReversedVal,
2908 m_MaskedLoad(m_VPValue(EndPtr),
2909 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2910 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2911 Mask = GetVPReverse(Mask);
2912 Addr = AdjustEndPtr(EndPtr);
2913 auto *LoadR = new VPWidenLoadEVLRecipe(
2914 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
2915 LoadR->insertBefore(&CurRecipe);
2916 return new VPWidenIntrinsicRecipe(
2917 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2918 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2919 }
2920
2921 VPValue *StoredVal;
2922 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2923 m_RemoveMask(HeaderMask, Mask))))
2924 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2925 StoredVal, EVL, Mask);
2926
2927 if (match(&CurRecipe,
2928 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2929 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
2930 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
2931 Mask = GetVPReverse(Mask);
2932 Addr = AdjustEndPtr(EndPtr);
2933 StoredVal = GetVPReverse(ReversedVal);
2934 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2935 StoredVal, EVL, Mask);
2936 }
2937
2938 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2939 if (Rdx->isConditional() &&
2940 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2941 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2942
2943 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2944 if (Interleave->getMask() &&
2945 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2946 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2947
2948 VPValue *LHS, *RHS;
2949 if (match(&CurRecipe,
2950 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2951 return new VPWidenIntrinsicRecipe(
2952 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2953 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2954
2955 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2956 m_VPValue(RHS))))
2957 return new VPWidenIntrinsicRecipe(
2958 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2959 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2960
2961 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2962 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2963 VPValue *ZExt = VPBuilder(&CurRecipe)
2965 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
2966 return new VPInstruction(
2967 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
2968 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
2969 }
2970
2971 // lhs | (headermask && rhs) -> vp.merge rhs, true, lhs, evl
2972 if (match(&CurRecipe,
2974 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(RHS)))))
2975 return new VPWidenIntrinsicRecipe(
2976 Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL},
2977 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2978
2979 return nullptr;
2980}
2981
2982/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
2983/// The transforms here need to preserve the original semantics.
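/// For example, a remaining (logical-and %header.mask, %m) is rewritten as
/// vp.merge(true, %m, false, %evl), which produces the same lanes once the
/// header mask is implied by EVL predication.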
2985 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
2986 VPValue *HeaderMask = nullptr, *EVL = nullptr;
2989 m_VPValue(EVL))) &&
2990 match(EVL, m_EVL(m_VPValue()))) {
2991 HeaderMask = R.getVPSingleValue();
2992 break;
2993 }
2994 }
2995 if (!HeaderMask)
2996 return;
2997
2998 VPTypeAnalysis TypeInfo(Plan);
2999 SmallVector<VPRecipeBase *> OldRecipes;
3000 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3002 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3003 NewR->insertBefore(R);
3004 for (auto [Old, New] :
3005 zip_equal(R->definedValues(), NewR->definedValues()))
3006 Old->replaceAllUsesWith(New);
3007 OldRecipes.push_back(R);
3008 }
3009 }
3010
3011 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3012 // False, EVL)
3013 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3014 VPValue *Mask;
3015 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3016 auto *LogicalAnd = cast<VPInstruction>(U);
3017 auto *Merge = new VPWidenIntrinsicRecipe(
3018 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3019 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3020 Merge->insertBefore(LogicalAnd);
3021 LogicalAnd->replaceAllUsesWith(Merge);
3022 OldRecipes.push_back(LogicalAnd);
3023 }
3024 }
3025
3026 // Erase old recipes at the end so we don't invalidate TypeInfo.
3027 for (VPRecipeBase *R : reverse(OldRecipes)) {
3028 SmallVector<VPValue *> PossiblyDead(R->operands());
3029 R->eraseFromParent();
3030 for (VPValue *Op : PossiblyDead)
3032 }
3033}
3034
3035 /// After replacing the canonical IV with an EVL-based IV, fixup recipes that use
3036/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3037/// iteration.
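/// For example (schematically), a widened pointer induction that steps by VF
/// (or VFxUF) per iteration is changed to step by the EVL computed for the
/// current iteration.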
3038static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3039 VPTypeAnalysis TypeInfo(Plan);
3040 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3041 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3042
3043 assert(all_of(Plan.getVF().users(),
3046 "User of VF that we can't transform to EVL.");
3047 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3049 });
3050
3051 assert(all_of(Plan.getVFxUF().users(),
3053 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3054 m_Specific(&Plan.getVFxUF())),
3056 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3057 "increment of the canonical induction.");
3058 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3059 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3060 // canonical induction must not be updated.
3062 });
3063
3064 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3065 // contained.
3066 bool ContainsFORs =
3068 if (ContainsFORs) {
3069 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3070 VPValue *MaxEVL = &Plan.getVF();
3071     // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3072 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3073 MaxEVL = Builder.createScalarZExtOrTrunc(
3074 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3075 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3076
3077 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3078 VPValue *PrevEVL = Builder.createScalarPhi(
3079 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3080
3083 for (VPRecipeBase &R : *VPBB) {
3084 VPValue *V1, *V2;
3085 if (!match(&R,
3087 m_VPValue(V1), m_VPValue(V2))))
3088 continue;
3089 VPValue *Imm = Plan.getOrAddLiveIn(
3092 Intrinsic::experimental_vp_splice,
3093 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3094 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3095 R.getDebugLoc());
3096 VPSplice->insertBefore(&R);
3097 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3098 }
3099 }
3100 }
3101
3102 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3103 if (!HeaderMask)
3104 return;
3105
3106 // Ensure that any reduction that uses a select to mask off tail lanes does so
3107 // in the vector loop, not the middle block, since EVL tail folding can have
3108 // tail elements in the penultimate iteration.
3109 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3110 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3111 m_VPValue(), m_VPValue()))))
3112 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3113 Plan.getVectorLoopRegion();
3114 return true;
3115 }));
3116
3117 // Replace header masks with a mask equivalent to predicating by EVL:
3118 //
3119 // icmp ule widen-canonical-iv backedge-taken-count
3120 // ->
3121 // icmp ult step-vector, EVL
3122 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3123 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3124 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3125 VPValue *EVLMask = Builder.createICmp(
3127 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3128 HeaderMask->replaceAllUsesWith(EVLMask);
3129}
3130
3131/// Converts a tail folded vector loop region to step by
3132/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3133/// iteration.
3134///
3135/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3136/// replaces all uses of the canonical IV except for the canonical IV
3137/// increment with a VPCurrentIterationPHIRecipe. The canonical IV is used
3138/// only for loop iterations counting after this transformation.
3139///
3140/// - The header mask is replaced with a header mask based on the EVL.
3141///
3142/// - Plans with FORs have a new phi added to keep track of the EVL of the
3143/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3144/// @llvm.vp.splice.
3145///
3146/// The function uses the following definitions:
3147/// %StartV is the canonical induction start value.
3148///
3149/// The function adds the following recipes:
3150///
3151/// vector.ph:
3152/// ...
3153///
3154/// vector.body:
3155/// ...
3156/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3157/// [ %NextIter, %vector.body ]
3158/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3159/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3160/// ...
3161/// %OpEVL = cast i32 %VPEVL to IVSize
3162/// %NextIter = add IVSize %OpEVL, %CurrentIter
3163/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3164/// ...
3165///
3166/// If MaxSafeElements is provided, the function adds the following recipes:
3167/// vector.ph:
3168/// ...
3169///
3170/// vector.body:
3171/// ...
3172/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3173/// [ %NextIter, %vector.body ]
3174/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3175/// %cmp = cmp ult %AVL, MaxSafeElements
3176/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3177/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3178/// ...
3179/// %OpEVL = cast i32 %VPEVL to IVSize
3180/// %NextIter = add IVSize %OpEVL, %CurrentIter
3181/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3182/// ...
3183///
3185 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3186 if (Plan.hasScalarVFOnly())
3187 return;
3188 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3189 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3190
3191 auto *CanonicalIV = LoopRegion->getCanonicalIV();
3192 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3193 VPValue *StartV = Plan.getZero(CanIVTy);
3194 auto *CanonicalIVIncrement = LoopRegion->getOrCreateCanonicalIVIncrement();
3195
3196 // Create the CurrentIteration recipe in the vector loop.
3197 auto *CurrentIteration =
3199 CurrentIteration->insertBefore(*Header, Header->begin());
3200 VPBuilder Builder(Header, Header->getFirstNonPhi());
3201 // Create the AVL (application vector length), starting from TC -> 0 in steps
3202 // of EVL.
3203 VPPhi *AVLPhi = Builder.createScalarPhi(
3204 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3205 VPValue *AVL = AVLPhi;
3206
3207 if (MaxSafeElements) {
3208 // Support for MaxSafeDist for correct loop emission.
3209 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3210 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3211 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3212 "safe_avl");
3213 }
3214 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3215 DebugLoc::getUnknown(), "evl");
3216
3217 Builder.setInsertPoint(CanonicalIVIncrement);
3218 VPValue *OpVPEVL = VPEVL;
3219
3220 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3221 OpVPEVL = Builder.createScalarZExtOrTrunc(
3222 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3223
3224 auto *NextIter = Builder.createAdd(
3225 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3226 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3227 CurrentIteration->addOperand(NextIter);
3228
3229 VPValue *NextAVL =
3230 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3231 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3232 AVLPhi->addOperand(NextAVL);
3233
3234 fixupVFUsersForEVL(Plan, *VPEVL);
3235 removeDeadRecipes(Plan);
3236
3237 // Replace all uses of the canonical IV with VPCurrentIterationPHIRecipe
3238 // except for the canonical IV increment.
3239 CanonicalIV->replaceAllUsesWith(CurrentIteration);
3240 CanonicalIVIncrement->setOperand(0, CanonicalIV);
3241 // TODO: support unroll factor > 1.
3242 Plan.setUF(1);
3243}
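// Scalar model of the stepping produced above (illustrative sketch, assuming
// unsigned arithmetic; explicitVectorLength() stands in for the target-chosen
// EXPLICIT-VECTOR-LENGTH value, which never exceeds its operand):
//
//   uint64_t AVL = TripCount;
//   for (uint64_t CurrentIter = 0; AVL != 0;) {
//     uint64_t SafeAVL =
//         MaxSafeElements ? std::min<uint64_t>(AVL, *MaxSafeElements) : AVL;
//     uint64_t EVL = explicitVectorLength(SafeAVL);
//     // ... process elements [CurrentIter, CurrentIter + EVL) ...
//     CurrentIter += EVL;
//     AVL -= EVL; // nuw: EVL <= AVL by construction
//   }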
3244
3246 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3247 // There should be at most one VPCurrentIterationPHIRecipe in the entire plan.
3248 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3249
3252 for (VPRecipeBase &R : VPBB->phis())
3253 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3254 assert(!CurrentIteration &&
3255 "Found multiple CurrentIteration. Only one expected");
3256 CurrentIteration = PhiR;
3257 }
3258
3259 // Early return if it is not variable-length stepping.
3260 if (!CurrentIteration)
3261 return;
3262
3263 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3264 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3265
3266 // Convert CurrentIteration to concrete recipe.
3267 auto *ScalarR =
3268 VPBuilder(CurrentIteration)
3269 .createScalarPhi(
3270 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3271 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3272 CurrentIteration->replaceAllUsesWith(ScalarR);
3273 CurrentIteration->eraseFromParent();
3274
3275 // Replace CanonicalIVInc with CurrentIteration increment if it exists.
3276 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3277 if (auto *CanIVInc = vputils::findUserOf(
3278 CanonicalIV, m_c_Add(m_VPValue(), m_Specific(&Plan.getVFxUF())))) {
3279 cast<VPInstruction>(CanIVInc)->replaceAllUsesWith(CurrentIterationIncr);
3280 CanIVInc->eraseFromParent();
3281 }
3282}
3283
3285 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3286 if (!LoopRegion)
3287 return;
3288 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3289 if (Header->empty())
3290 return;
3291 // The EVL IV is always at the beginning.
3292 auto *EVLPhi = dyn_cast<VPCurrentIterationPHIRecipe>(&Header->front());
3293 if (!EVLPhi)
3294 return;
3295
3296 // Bail if not an EVL tail folded loop.
3297 VPValue *AVL;
3298 if (!match(EVLPhi->getBackedgeValue(),
3299 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3300 return;
3301
3302 // The AVL may be capped to a safe distance.
3303 VPValue *SafeAVL, *UnsafeAVL;
3304 if (match(AVL,
3306 m_VPValue(SafeAVL)),
3307 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3308 AVL = UnsafeAVL;
3309
3310 VPValue *AVLNext;
3311 [[maybe_unused]] bool FoundAVLNext =
3313 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3314 assert(FoundAVLNext && "Didn't find AVL backedge?");
3315
3316 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3317 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3318 if (match(LatchBr, m_BranchOnCond(m_True())))
3319 return;
3320
3321 VPValue *CanIVInc;
3322 [[maybe_unused]] bool FoundIncrement = match(
3323 LatchBr,
3325 m_Specific(&Plan.getVectorTripCount()))));
3326 assert(FoundIncrement &&
3327 match(CanIVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
3328 m_Specific(&Plan.getVFxUF()))) &&
3329 "Expected BranchOnCond with ICmp comparing CanIV + VFxUF with vector "
3330 "trip count");
3331
3332 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3333 VPBuilder Builder(LatchBr);
3334 LatchBr->setOperand(
3335 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3336}
3337
3338void VPlanTransforms::replaceSymbolicStrides(
3339 VPlan &Plan, PredicatedScalarEvolution &PSE,
3340 const DenseMap<Value *, const SCEV *> &StridesMap) {
3341 // Replace VPValues for known constant strides guaranteed by predicated
3342 // scalar evolution.
3343 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3344 auto *R = cast<VPRecipeBase>(&U);
3345 return R->getRegion() ||
3346 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3347 };
3348 ValueToSCEVMapTy RewriteMap;
3349 for (const SCEV *Stride : StridesMap.values()) {
3350 using namespace SCEVPatternMatch;
3351 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3352 const APInt *StrideConst;
3353 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3354 // Only handle constant strides for now.
3355 continue;
3356
3357 auto *CI = Plan.getConstantInt(*StrideConst);
3358 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3359 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3360
3361 // The versioned value may not be used in the loop directly but through a
3362 // sext/zext. Add new live-ins in those cases.
3363 for (Value *U : StrideV->users()) {
3364 if (!isa<SExtInst, ZExtInst>(U))
3365 continue;
3366 VPValue *StrideVPV = Plan.getLiveIn(U);
3367 if (!StrideVPV)
3368 continue;
3369 unsigned BW = U->getType()->getScalarSizeInBits();
3370 APInt C =
3371 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3372 VPValue *CI = Plan.getConstantInt(C);
3373 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3374 }
3375 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3376 }
3377
3378 for (VPRecipeBase &R : *Plan.getEntry()) {
3379 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3380 if (!ExpSCEV)
3381 continue;
3382 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3383 auto *NewSCEV =
3384 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3385 if (NewSCEV != ScevExpr) {
3386 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3387 ExpSCEV->replaceAllUsesWith(NewExp);
3388 if (Plan.getTripCount() == ExpSCEV)
3389 Plan.resetTripCount(NewExp);
3390 }
3391 }
3392}
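// Example of the sext/zext handling above (illustrative, hypothetical values):
// if PSE proves a symbolic i32 stride to be the constant 1 and the loop only
// uses it through a sext to i64, the sext's live-in is replaced by the widened
// constant:
//
//   APInt Stride(/*numBits=*/32, /*val=*/1);
//   APInt WideStride = Stride.sext(/*width=*/64); // live-in for the sext user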
3393
3395 // Collect recipes in the backward slice of `Root` that may generate a poison
3396 // value that is used after vectorization.
3397 SmallPtrSet<VPRecipeBase *, 16> Visited;
3398 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3399 SmallVector<VPRecipeBase *, 16> Worklist;
3400 Worklist.push_back(Root);
3401
3402 // Traverse the backward slice of Root through its use-def chain.
3403 while (!Worklist.empty()) {
3404 VPRecipeBase *CurRec = Worklist.pop_back_val();
3405
3406 if (!Visited.insert(CurRec).second)
3407 continue;
3408
3409 // Prune search if we find another recipe generating a widen memory
3410 // instruction. Widen memory instructions involved in address computation
3411 // will lead to gather/scatter instructions, which don't need to be
3412 // handled.
3413 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3414 VPHeaderPHIRecipe>(CurRec))
3415 continue;
3416
3417 // This recipe contributes to the address computation of a widen
3418 // load/store. If the underlying instruction has poison-generating flags,
3419 // drop them directly.
3420 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3421 VPValue *A, *B;
3422 // Dropping disjoint from an OR may yield incorrect results, as some
3423 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3424 // for dependence analysis). Instead, replace it with an equivalent Add.
3425 // This is possible as all users of the disjoint OR only access lanes
3426 // where the operands are disjoint or poison otherwise.
3427 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3428 RecWithFlags->isDisjoint()) {
3429 VPBuilder Builder(RecWithFlags);
3430 VPInstruction *New =
3431 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3432 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3433 RecWithFlags->replaceAllUsesWith(New);
3434 RecWithFlags->eraseFromParent();
3435 CurRec = New;
3436 } else
3437 RecWithFlags->dropPoisonGeneratingFlags();
3438 } else {
3439 Instruction *Instr = dyn_cast_or_null<Instruction>(
3440 CurRec->getVPSingleValue()->getUnderlyingValue());
3441 (void)Instr;
3442 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3443 "found instruction with poison generating flags not covered by "
3444 "VPRecipeWithIRFlags");
3445 }
3446
3447 // Add new definitions to the worklist.
3448 for (VPValue *Operand : CurRec->operands())
3449 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3450 Worklist.push_back(OpDef);
3451 }
3452 });
3453
3454 // We want to exclude the tail folding case, as we don't need to drop flags
3455 // for operations computing the first lane in this case: the first lane of the
3456 // header mask must always be true.
3457 auto IsNotHeaderMask = [&Plan](VPValue *Mask) {
3458 return Mask && !vputils::isHeaderMask(Mask, Plan);
3459 };
3460
3461 // Traverse all the recipes in the VPlan and collect the poison-generating
3462 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3463 // VPInterleaveRecipe.
3464 auto Iter =
3465 vp_depth_first_deep(Plan.getEntry());
3466 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3467 for (VPRecipeBase &Recipe : *VPBB) {
3468 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3469 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3470 if (AddrDef && WidenRec->isConsecutive() &&
3471 IsNotHeaderMask(WidenRec->getMask()))
3472 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3473 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3474 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3475 if (AddrDef && IsNotHeaderMask(InterleaveRec->getMask()))
3476 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3477 }
3478 }
3479 }
3480}
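// Why the disjoint-or is rewritten rather than just stripped of its flag
// (illustrative): when the set bits of the operands do not overlap,
//
//   (X | Y) == (X + Y)   e.g. X = 0b0100, Y = 0b0011 gives 0b0111 either way,
//
// so replacing "or disjoint" with "add" preserves the value for the lanes that
// matter, whereas merely dropping the flag could contradict conclusions SCEV
// already drew by treating the or as an add.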
3481
3482void VPlanTransforms::createInterleaveGroups(
3483 VPlan &Plan,
3484 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3485 &InterleaveGroups,
3486 const bool &EpilogueAllowed) {
3487 if (InterleaveGroups.empty())
3488 return;
3489
3491 for (VPBasicBlock *VPBB :
3494 for (VPRecipeBase &R :
3496 auto &MemR = cast<VPWidenMemoryRecipe>(R);
3497 IRMemberToRecipe[&MemR.getIngredient()] = &MemR;
3498 }
3499
3500 // Interleave memory: for each Interleave Group we marked earlier as relevant
3501 // for this VPlan, replace the Recipes widening its memory instructions with a
3502 // single VPInterleaveRecipe at its insertion point.
3503 VPDominatorTree VPDT(Plan);
3504 for (const auto *IG : InterleaveGroups) {
3505 // Skip interleave groups where members don't have recipes. This can happen
3506 // when removeDeadRecipes removes recipes that are part of interleave groups
3507 // but have no users.
3508 if (llvm::any_of(IG->members(), [&IRMemberToRecipe](Instruction *Member) {
3509 return !IRMemberToRecipe.contains(Member);
3510 }))
3511 continue;
3512
3513 auto *Start = IRMemberToRecipe.lookup(IG->getMember(0));
3514 VPIRMetadata InterleaveMD(*Start);
3515 SmallVector<VPValue *, 4> StoredValues;
3516 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3517 StoredValues.push_back(StoreR->getStoredValue());
3518 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3519 Instruction *MemberI = IG->getMember(I);
3520 if (!MemberI)
3521 continue;
3522 VPWidenMemoryRecipe *MemoryR = IRMemberToRecipe.lookup(MemberI);
3523 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3524 StoredValues.push_back(StoreR->getStoredValue());
3525 InterleaveMD.intersect(*MemoryR);
3526 }
3527
3528 bool NeedsMaskForGaps =
3529 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3530 (!StoredValues.empty() && !IG->isFull());
3531
3532 Instruction *IRInsertPos = IG->getInsertPos();
3533 auto *InsertPos = IRMemberToRecipe.lookup(IRInsertPos);
3534
3536 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3537 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3538 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3539
3540 // Get or create the start address for the interleave group.
3541 VPValue *Addr = Start->getAddr();
3542 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3543 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3544 // We cannot re-use the address of member zero because it does not
3545 // dominate the insert position. Instead, use the address of the insert
3546 // position and create a PtrAdd adjusting it to the address of member
3547 // zero.
3548 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3549 // InsertPos or sink loads above zero members to join it.
3550 assert(IG->getIndex(IRInsertPos) != 0 &&
3551 "index of insert position shouldn't be zero");
3552 auto &DL = IRInsertPos->getDataLayout();
3553 APInt Offset(32,
3554 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3555 IG->getIndex(IRInsertPos),
3556 /*IsSigned=*/true);
3557 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3558 VPBuilder B(InsertPos);
3559 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3560 }
3561 // If the group is reverse, adjust the index to refer to the last vector
3562 // lane instead of the first. We adjust the index from the first vector
3563 // lane, rather than directly getting the pointer for lane VF - 1, because
3564 // the pointer operand of the interleaved access is supposed to be uniform.
3565 if (IG->isReverse()) {
3566 auto *ReversePtr = new VPVectorEndPointerRecipe(
3567 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3568 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3569 ReversePtr->insertBefore(InsertPos);
3570 Addr = ReversePtr;
3571 }
3572 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3573 InsertPos->getMask(), NeedsMaskForGaps,
3574 InterleaveMD, InsertPos->getDebugLoc());
3575 VPIG->insertBefore(InsertPos);
3576
3577 unsigned J = 0;
3578 for (unsigned i = 0; i < IG->getFactor(); ++i)
3579 if (Instruction *Member = IG->getMember(i)) {
3580 VPRecipeBase *MemberR = IRMemberToRecipe.lookup(Member);
3581 if (!Member->getType()->isVoidTy()) {
3582 VPValue *OriginalV = MemberR->getVPSingleValue();
3583 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3584 J++;
3585 }
3586 MemberR->eraseFromParent();
3587 }
3588 }
3589}
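// Sketch of the member-zero address adjustment above (hypothetical numbers):
// for i32 members (4 bytes alloc size) with the insert position at index 2 in
// its group, the PtrAdd offset is
//
//   -(DL.getTypeAllocSize(i32) * IG->getIndex(IRInsertPos)) = -(4 * 2) = -8,
//
// i.e. member 0 is addressed 8 bytes before the insert position's address.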
3590
3591/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3592/// value, phi and backedge value. In the following example:
3593///
3594/// vector.ph:
3595/// Successor(s): vector loop
3596///
3597/// <x1> vector loop: {
3598/// vector.body:
3599/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3600/// ...
3601/// EMIT branch-on-count ...
3602/// No successors
3603/// }
3604///
3605/// WIDEN-INDUCTION will get expanded to:
3606///
3607/// vector.ph:
3608/// ...
3609/// vp<%induction.start> = ...
3610/// vp<%induction.increment> = ...
3611///
3612/// Successor(s): vector loop
3613///
3614/// <x1> vector loop: {
3615/// vector.body:
3616/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3617/// ...
3618/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3619/// EMIT branch-on-count ...
3620/// No successors
3621/// }
3622static void
3623expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3624 VPTypeAnalysis &TypeInfo) {
3625 VPlan *Plan = WidenIVR->getParent()->getPlan();
3626 VPValue *Start = WidenIVR->getStartValue();
3627 VPValue *Step = WidenIVR->getStepValue();
3628 VPValue *VF = WidenIVR->getVFValue();
3629 DebugLoc DL = WidenIVR->getDebugLoc();
3630
3631 // The value from the original loop to which we are mapping the new induction
3632 // variable.
3633 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3634
3635 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3636 Instruction::BinaryOps AddOp;
3637 Instruction::BinaryOps MulOp;
3638 VPIRFlags Flags = *WidenIVR;
3639 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3640 AddOp = Instruction::Add;
3641 MulOp = Instruction::Mul;
3642 } else {
3643 AddOp = ID.getInductionOpcode();
3644 MulOp = Instruction::FMul;
3645 }
3646
3647 // If the phi is truncated, truncate the start and step values.
3648 VPBuilder Builder(Plan->getVectorPreheader());
3649 Type *StepTy = TypeInfo.inferScalarType(Step);
3650 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3651 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3652 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3653 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3654 StepTy = Ty;
3655 }
3656
3657 // Construct the initial value of the vector IV in the vector loop preheader.
3658 Type *IVIntTy =
3659 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3660 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3661 if (StepTy->isFloatingPointTy())
3662 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3663
3664 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3665 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3666
3667 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3668 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3669 DebugLoc::getUnknown(), "induction");
3670
3671 // Create the widened phi of the vector IV.
3672 auto *WidePHI = VPBuilder(WidenIVR).createWidenPhi(
3673 Init, WidenIVR->getDebugLoc(), "vec.ind");
3674
3675 // Create the backedge value for the vector IV.
3676 VPValue *Inc;
3677 VPValue *Prev;
3678 // If unrolled, use the increment and prev value from the operands.
3679 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3680 Inc = SplatVF;
3681 Prev = WidenIVR->getLastUnrolledPartOperand();
3682 } else {
3683 if (VPRecipeBase *R = VF->getDefiningRecipe())
3684 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3685 // Multiply the vectorization factor by the step using integer or
3686 // floating-point arithmetic as appropriate.
3687 if (StepTy->isFloatingPointTy())
3688 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3689 DL);
3690 else
3691 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3692 TypeInfo.inferScalarType(VF), DL);
3693
3694 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3695 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3696 Prev = WidePHI;
3697 }
3698
3699 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3700 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3701 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3702 WidenIVR->getDebugLoc(), "vec.ind.next");
3703
3704 WidePHI->addOperand(Next);
3705
3706 WidenIVR->replaceAllUsesWith(WidePHI);
3707}
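// Per-lane view of the expansion above (illustrative sketch, non-truncated
// integer case, not unrolled):
//
//   Init[L] = Start + L * Step;    // StepVector * splat(Step) + splat(Start)
//   Next[L] = Prev[L] + VF * Step; // Prev + broadcast(Step * VF)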
3708
3709/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3710/// initial value, phi and backedge value. In the following example:
3711///
3712/// <x1> vector loop: {
3713/// vector.body:
3714/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3715/// ...
3716/// EMIT branch-on-count ...
3717/// }
3718///
3719/// WIDEN-POINTER-INDUCTION will get expanded to:
3720///
3721/// <x1> vector loop: {
3722/// vector.body:
3723/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3724/// EMIT %mul = mul %stepvector, %step
3725/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3726/// ...
3727/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3728/// EMIT branch-on-count ...
3729/// }
3730static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3731 VPTypeAnalysis &TypeInfo) {
3732 VPlan *Plan = R->getParent()->getPlan();
3733 VPValue *Start = R->getStartValue();
3734 VPValue *Step = R->getStepValue();
3735 VPValue *VF = R->getVFValue();
3736
3737 assert(R->getInductionDescriptor().getKind() ==
3738 InductionDescriptor::IK_PtrInduction &&
3739 "Not a pointer induction according to InductionDescriptor!");
3740 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3741 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3742 "Recipe should have been replaced");
3743
3744 VPBuilder Builder(R);
3745 DebugLoc DL = R->getDebugLoc();
3746
3747 // Build a scalar pointer phi.
3748 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3749
3750 // Create actual address geps that use the pointer phi as base and a
3751 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3752 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3753 Type *StepTy = TypeInfo.inferScalarType(Step);
3754 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3755 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3756 VPValue *PtrAdd =
3757 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3758 R->replaceAllUsesWith(PtrAdd);
3759
3760 // Create the backedge value for the scalar pointer phi.
3761 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3762 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3763 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3764 DL);
3765 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3766
3767 VPValue *InductionGEP =
3768 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3769 ScalarPtrPhi->addOperand(InductionGEP);
3770}
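// Per-lane view of the expansion above (illustrative sketch):
//
//   VectorGEP[L] = PointerPhi + L * Step;  // wide-ptradd(pointer.phi, mul)
//   PtrInd       = PointerPhi + VF * Step; // scalar backedge value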
3771
3772/// Expand a VPDerivedIVRecipe into executable recipes.
3773static void expandVPDerivedIV(VPDerivedIVRecipe *R, VPTypeAnalysis &TypeInfo) {
3774 VPBuilder Builder(R);
3775 VPIRValue *Start = R->getStartValue();
3776 VPValue *Step = R->getStepValue();
3777 VPValue *Index = R->getIndex();
3778 Type *StepTy = TypeInfo.inferScalarType(Step);
3779 Type *IndexTy = TypeInfo.inferScalarType(Index);
3780 Index = StepTy->isIntegerTy()
3781 ? Builder.createScalarSExtOrTrunc(
3782 Index, StepTy, IndexTy, DebugLoc::getCompilerGenerated())
3783 : Builder.createScalarCast(Instruction::SIToFP, Index, StepTy,
3784 DebugLoc::getCompilerGenerated());
3785 switch (R->getInductionKind()) {
3786 case InductionDescriptor::IK_IntInduction: {
3787 assert(TypeInfo.inferScalarType(Index) == TypeInfo.inferScalarType(Start) &&
3788 "Index type does not match StartValue type");
3789 return R->replaceAllUsesWith(Builder.createAdd(
3790 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3791 }
3792 case InductionDescriptor::IK_PtrInduction:
3793 return R->replaceAllUsesWith(Builder.createPtrAdd(
3794 Start, Builder.createOverflowingOp(Instruction::Mul, {Index, Step})));
3795 case InductionDescriptor::IK_FpInduction: {
3796 assert(StepTy->isFloatingPointTy() && "Expected FP Step value");
3797 const FPMathOperator *FPBinOp = R->getFPBinOp();
3798 assert(FPBinOp &&
3799 (FPBinOp->getOpcode() == Instruction::FAdd ||
3800 FPBinOp->getOpcode() == Instruction::FSub) &&
3801 "Original BinOp should be defined for FP induction");
3802 FastMathFlags FMF = FPBinOp->getFastMathFlags();
3803 VPValue *FMul = Builder.createNaryOp(Instruction::FMul, {Step, Index}, FMF);
3804 return R->replaceAllUsesWith(
3805 Builder.createNaryOp(FPBinOp->getOpcode(), {Start, FMul}, FMF));
3806 }
3807 case InductionDescriptor::IK_NoInduction:
3808 return;
3809 }
3810 llvm_unreachable("Unhandled induction kind");
3811}
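// Summary of the expansion above (illustrative): per element, the derived IV
// is computed as
//
//   DerivedIV = Start + Index * Step            // integer inductions
//   DerivedIV = ptradd(Start, Index * Step)     // pointer inductions
//   DerivedIV = Start fadd/fsub (Step * Index)  // FP inductions, using FMF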
3812
3813void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3814 // Replace loop regions with explicit CFG.
3815 SmallVector<VPRegionBlock *> LoopRegions;
3816 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3817 vp_depth_first_deep(Plan.getEntry()))) {
3818 if (!R->isReplicator())
3819 LoopRegions.push_back(R);
3820 }
3821 for (VPRegionBlock *R : LoopRegions)
3822 R->dissolveToCFGLoop();
3823}
3824
3827 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3828 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3831 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3832 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3833 }
3834
3835 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3836 // single-condition branches:
3837 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3838 // the first condition is true, and otherwise jumps to a new interim block.
3839 // 2. A branch that ends the interim block, jumps to the second successor if
3840 // the second condition is true, and otherwise jumps to the third
3841 // successor.
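  // Equivalent scalar control flow (illustrative sketch):
  //
  //   if (Cond0) goto Succ0; // branch replacing BranchOnTwoConds
  //   // interim block:
  //   if (Cond1) goto Succ1;
  //   goto Succ2;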
3842 for (VPInstruction *Br : WorkList) {
3843 assert(Br->getNumOperands() == 2 &&
3844 "BranchOnTwoConds must have exactly 2 conditions");
3845 DebugLoc DL = Br->getDebugLoc();
3846 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3847 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3848 assert(Successors.size() == 3 &&
3849 "BranchOnTwoConds must have exactly 3 successors");
3850
3851 for (VPBlockBase *Succ : Successors)
3852 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3853
3854 VPValue *Cond0 = Br->getOperand(0);
3855 VPValue *Cond1 = Br->getOperand(1);
3856 VPBlockBase *Succ0 = Successors[0];
3857 VPBlockBase *Succ1 = Successors[1];
3858 VPBlockBase *Succ2 = Successors[2];
3859 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3860 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3861
3862 VPBasicBlock *InterimBB =
3863 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3864
3865 VPBuilder(BrOnTwoCondsBB)
3866 .createNaryOp(VPInstruction::BranchOnCond, {Cond0}, DL);
3867 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3868 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3869
3871 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3872 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3873 Br->eraseFromParent();
3874 }
3875}
3876
3877void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3878 VPTypeAnalysis TypeInfo(Plan);
3879 SmallVector<VPRecipeBase *> ToRemove;
3880 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3881 vp_depth_first_deep(Plan.getEntry()))) {
3882 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3883 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3884 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3885 ToRemove.push_back(WidenIVR);
3886 continue;
3887 }
3888
3889 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3890 // If the recipe only generates scalars, scalarize it instead of
3891 // expanding it.
3892 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3893 VPBuilder Builder(WidenIVR);
3894 VPValue *PtrAdd =
3895 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3896 WidenIVR->replaceAllUsesWith(PtrAdd);
3897 ToRemove.push_back(WidenIVR);
3898 continue;
3899 }
3900 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3901 ToRemove.push_back(WidenIVR);
3902 continue;
3903 }
3904
3905 if (auto *DerivedIVR = dyn_cast<VPDerivedIVRecipe>(&R)) {
3906 expandVPDerivedIV(DerivedIVR, TypeInfo);
3907 ToRemove.push_back(DerivedIVR);
3908 continue;
3909 }
3910
3911 // Expand VPBlendRecipe into VPInstruction::Select.
3912 VPBuilder Builder(&R);
3913 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3914 VPValue *Select = Blend->getIncomingValue(0);
3915 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3916 Select = Builder.createSelect(Blend->getMask(I),
3917 Blend->getIncomingValue(I), Select,
3918 R.getDebugLoc(), "predphi", *Blend);
3919 Blend->replaceAllUsesWith(Select);
3920 ToRemove.push_back(Blend);
3921 }
3922
3923 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
3924 if (!VEPR->getOffset()) {
3925 assert(Plan.getConcreteUF() == 1 &&
3926 "Expected unroller to have materialized offset for UF != 1");
3927 VEPR->materializeOffset();
3928 }
3929 }
3930
3931 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3932 Expr->decompose();
3933 ToRemove.push_back(Expr);
3934 }
3935
3936 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
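      // Illustrative example: for a mask <1,1,1,0,...>, the inverted mask is
      // <0,0,0,1,...>, FirstActiveLane(not mask) == 3, and the last active
      // lane is 3 - 1 == 2. This assumes at least one active lane.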
3937 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3938 if (LastActiveL &&
3939 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3940 // Create Not(Mask) for all operands.
3941 SmallVector<VPValue *> NotMasks;
3942 for (VPValue *Op : LastActiveL->operands()) {
3943 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3944 NotMasks.push_back(NotMask);
3945 }
3946
3947 // Create FirstActiveLane on the inverted masks.
3948 VPValue *FirstInactiveLane = Builder.createFirstActiveLane(
3949 NotMasks, LastActiveL->getDebugLoc(), "first.inactive.lane");
3950
3951 // Subtract 1 to get the last active lane.
3952 VPValue *One =
3953 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
3954 VPValue *LastLane =
3955 Builder.createSub(FirstInactiveLane, One,
3956 LastActiveL->getDebugLoc(), "last.active.lane");
3957
3958 LastActiveL->replaceAllUsesWith(LastLane);
3959 ToRemove.push_back(LastActiveL);
3960 continue;
3961 }
3962
3963 // Lower MaskedCond with block mask to LogicalAnd.
3965 auto *VPI = cast<VPInstruction>(&R);
3966 assert(VPI->isMasked() &&
3967 "Unmasked MaskedCond should be simplified earlier");
3968 VPI->replaceAllUsesWith(Builder.createNaryOp(
3969 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
3970 ToRemove.push_back(VPI);
3971 continue;
3972 }
3973
3974 // Lower CanonicalIVIncrementForPart to plain Add.
3975 if (match(
3976 &R,
3978 auto *VPI = cast<VPInstruction>(&R);
3979 VPValue *Add = Builder.createOverflowingOp(
3980 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
3981 VPI->getDebugLoc());
3982 VPI->replaceAllUsesWith(Add);
3983 ToRemove.push_back(VPI);
3984 continue;
3985 }
3986
3987 // Lower BranchOnCount to ICmp + BranchOnCond.
3988 VPValue *IV, *TC;
3989 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3990 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3991 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3992 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3993 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3994 ToRemove.push_back(BranchOnCountInst);
3995 continue;
3996 }
3997
3998 VPValue *VectorStep;
3999 VPValue *ScalarStep;
4001 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4002 continue;
4003
4004 // Expand WideIVStep.
4005 auto *VPI = cast<VPInstruction>(&R);
4006 Type *IVTy = TypeInfo.inferScalarType(VPI);
4007 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4008 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
4009 ? Instruction::UIToFP
4010 : Instruction::Trunc;
4011 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4012 }
4013
4014 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4015 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4016 ScalarStep =
4017 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4018 }
4019
4020 VPIRFlags Flags;
4021 unsigned MulOpc;
4022 if (IVTy->isFloatingPointTy()) {
4023 MulOpc = Instruction::FMul;
4024 Flags = VPI->getFastMathFlags();
4025 } else {
4026 MulOpc = Instruction::Mul;
4027 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4028 }
4029
4030 VPInstruction *Mul = Builder.createNaryOp(
4031 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4032 VectorStep = Mul;
4033 VPI->replaceAllUsesWith(VectorStep);
4034 ToRemove.push_back(VPI);
4035 }
4036 }
4037
4038 for (VPRecipeBase *R : ToRemove)
4039 R->eraseFromParent();
4040}
4041
4043 VPBasicBlock *HeaderVPBB,
4044 VPBasicBlock *LatchVPBB,
4045 VPBasicBlock *MiddleVPBB,
4046 UncountableExitStyle Style) {
4047 struct EarlyExitInfo {
4048 VPBasicBlock *EarlyExitingVPBB;
4049 VPIRBasicBlock *EarlyExitVPBB;
4050 VPValue *CondToExit;
4051 };
4052
4053 VPDominatorTree VPDT(Plan);
4054 VPBuilder Builder(LatchVPBB->getTerminator());
4056 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4057 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4058 if (Pred == MiddleVPBB)
4059 continue;
4060 // Collect condition for this early exit.
4061 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4062 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4063 VPValue *CondOfEarlyExitingVPBB;
4064 [[maybe_unused]] bool Matched =
4065 match(EarlyExitingVPBB->getTerminator(),
4066 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4067 assert(Matched && "Terminator must be BranchOnCond");
4068
4069 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4070 // the correct block mask.
4071 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4072 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4074 TrueSucc == ExitBlock
4075 ? CondOfEarlyExitingVPBB
4076 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4077 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4078 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4079 VPDT.properlyDominates(
4080 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4081 LatchVPBB)) &&
4082 "exit condition must dominate the latch");
4083 Exits.push_back({
4084 EarlyExitingVPBB,
4085 ExitBlock,
4086 CondToEarlyExit,
4087 });
4088 }
4089 }
4090
4091 assert(!Exits.empty() && "must have at least one early exit");
4092 // Sort exits by RPO order to get correct program order. RPO gives a
4093 // topological ordering of the CFG, ensuring upstream exits are checked
4094 // before downstream exits in the dispatch chain.
4096 HeaderVPBB);
4098 for (const auto &[Num, VPB] : enumerate(RPOT))
4099 RPOIdx[VPB] = Num;
4100 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4101 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4102 });
4103#ifndef NDEBUG
4104 // After RPO sorting, verify that for any pair where one exit dominates
4105 // another, the dominating exit comes first. This is guaranteed by RPO
4106 // (topological order) and is required for the dispatch chain correctness.
4107 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4108 for (unsigned J = I + 1; J < Exits.size(); ++J)
4109 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4110 Exits[I].EarlyExitingVPBB) &&
4111 "RPO sort must place dominating exits before dominated ones");
4112#endif
4113
4114 // Build the AnyOf condition for the latch terminator using logical OR
4115 // to avoid poison propagation from later exit conditions when an earlier
4116 // exit is taken.
4117 VPValue *Combined = Exits[0].CondToExit;
4118 for (const EarlyExitInfo &Info : drop_begin(Exits))
4119 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4120
4121 VPValue *IsAnyExitTaken =
4122 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4123
4125 "Early exit store masking not implemented");
4126
4127 // Create the vector.early.exit blocks.
4128 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4129 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4130 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4131 VPBasicBlock *VectorEarlyExitVPBB =
4132 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4133 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4134 }
4135
4136 // Create the dispatch block (or reuse the single exit block if only one
4137 // exit). The dispatch block computes the first active lane of the combined
4138 // condition and, for multiple exits, chains through conditions to determine
4139 // which exit to take.
4140 VPBasicBlock *DispatchVPBB =
4141 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4142 : Plan.createVPBasicBlock("vector.early.exit.check");
4143 DispatchVPBB->setPredecessors({LatchVPBB});
4144 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4145 VPValue *FirstActiveLane = DispatchBuilder.createFirstActiveLane(
4146 {Combined}, DebugLoc::getUnknown(), "first.active.lane");
4147
4148 // For each early exit, disconnect the original exiting block
4149 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4150 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4151 // values at the first active lane:
4152 //
4153 // Input:
4154 // early.exiting.I:
4155 // ...
4156 // EMIT branch-on-cond vp<%cond.I>
4157 // Successor(s): in.loop.succ, ir-bb<exit.I>
4158 //
4159 // ir-bb<exit.I>:
4160 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4161 //
4162 // Output:
4163 // early.exiting.I:
4164 // ...
4165 // Successor(s): in.loop.succ
4166 //
4167 // vector.early.exit.I:
4168 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4169 // Successor(s): ir-bb<exit.I>
4170 //
4171 // ir-bb<exit.I>:
4172 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4173 // vector.early.exit.I)
4174 //
4175 for (auto [Exit, VectorEarlyExitVPBB] :
4176 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4177 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4178 // Adjust the phi nodes in EarlyExitVPBB.
4179 // 1. remove incoming values from EarlyExitingVPBB,
4180 // 2. extract the incoming value at FirstActiveLane
4181 // 3. add back the extracts as last operands for the phis
4182 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4183 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4184 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4185 // values from VectorEarlyExitVPBB.
4186 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4187 auto *ExitIRI = cast<VPIRPhi>(&R);
4188 VPValue *IncomingVal =
4189 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4190 VPValue *NewIncoming = IncomingVal;
4191 if (!isa<VPIRValue>(IncomingVal)) {
4192 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4193 NewIncoming = EarlyExitBuilder.createNaryOp(
4194 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4195 DebugLoc::getUnknown(), "early.exit.value");
4196 }
4197 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4198 ExitIRI->addOperand(NewIncoming);
4199 }
4200
4201 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4202 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4203 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4204 }
4205
4206 // Chain through exits: for each exit, check if its condition is true at
4207 // the first active lane. If so, take that exit; otherwise, try the next.
4208 // The last exit needs no check since it must be taken if all others fail.
4209 //
4210 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4211 //
4212 // latch:
4213 // ...
4214 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4215 // ...
4216 //
4217 // vector.early.exit.check:
4218 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4219 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4220 // EMIT branch-on-cond vp<%at.cond.0>
4221 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4222 //
4223 // vector.early.exit.check.0:
4224 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4225 // EMIT branch-on-cond vp<%at.cond.1>
4226 // Successor(s): vector.early.exit.1, vector.early.exit.2
4227 VPBasicBlock *CurrentBB = DispatchVPBB;
4228 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4229 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4230 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4231 DebugLoc::getUnknown(), "exit.cond.at.lane");
4232
4233 // For the last dispatch, branch directly to the last exit on false;
4234 // otherwise, create a new check block.
4235 bool IsLastDispatch = (I + 2 == Exits.size());
4236 VPBasicBlock *FalseBB =
4237 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4238 : Plan.createVPBasicBlock(
4239 Twine("vector.early.exit.check.") + Twine(I));
4240
4241 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4242 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4243 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4244 FalseBB->setPredecessors({CurrentBB});
4245
4246 CurrentBB = FalseBB;
4247 DispatchBuilder.setInsertPoint(CurrentBB);
4248 }
4249
4250 // Replace the latch terminator with the new branching logic.
4251 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4252 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4253 "Unexpected terminator");
4254 auto *IsLatchExitTaken =
4255 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4256 LatchExitingBranch->getOperand(1));
4257
4258 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4259 LatchExitingBranch->eraseFromParent();
4260 Builder.setInsertPoint(LatchVPBB);
4261 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4262 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4263 LatchVPBB->clearSuccessors();
4264 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4265}
4266
4267/// This function tries to convert extended in-loop reductions to
4268/// VPExpressionRecipe and clamps the \p Range if that is beneficial and
4269/// valid. The created recipe must be decomposed into its constituent
4270/// recipes before execution.
4271static VPExpressionRecipe *
4272tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
4273 VFRange &Range) {
4274 Type *RedTy = Ctx.Types.inferScalarType(Red);
4275 VPValue *VecOp = Red->getVecOp();
4276
4277 assert(!Red->isPartialReduction() &&
4278 "This path does not support partial reductions");
4279
4280 // Clamp the range if using extended-reduction is profitable.
4281 auto IsExtendedRedValidAndClampRange =
4282 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4284 [&](ElementCount VF) {
4285 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4287
4289 InstructionCost ExtCost =
4290 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4291 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4292
4293 assert(!RedTy->isFloatingPointTy() &&
4294 "getExtendedReductionCost only supports integer types");
4295 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4296 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4297 Red->getFastMathFlags(), CostKind);
4298 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4299 },
4300 Range);
4301 };
4302
4303 VPValue *A;
4304 // Match reduce(ext(A)).
4305 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
4306 IsExtendedRedValidAndClampRange(
4307 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4308 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4309 Ctx.Types.inferScalarType(A)))
4310 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4311
4312 return nullptr;
4313}
4314
4315/// This function tries to convert multiply-accumulate in-loop reductions to
4316/// VPExpressionRecipe and clamps the \p Range if that is beneficial
4317/// and valid. The created VPExpressionRecipe must be decomposed into its
4318/// constituent recipes before execution. Patterns of the
4319/// VPExpressionRecipe:
4320/// reduce.add(mul(...)),
4321/// reduce.add(mul(ext(A), ext(B))),
4322/// reduce.add(ext(mul(ext(A), ext(B)))).
4323/// reduce.fadd(fmul(ext(A), ext(B)))
4324static VPExpressionRecipe *
4325tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
4326 VPCostContext &Ctx, VFRange &Range) {
4327 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4328 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4329 Opcode != Instruction::FAdd)
4330 return nullptr;
4331
4332 assert(!Red->isPartialReduction() &&
4333 "This path does not support partial reductions");
4334 Type *RedTy = Ctx.Types.inferScalarType(Red);
4335
4336 // Clamp the range if using multiply-accumulate-reduction is profitable.
4337 auto IsMulAccValidAndClampRange =
4338 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
4339 VPWidenCastRecipe *OuterExt) -> bool {
4341 [&](ElementCount VF) {
4343 Type *SrcTy =
4344 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4345 InstructionCost MulAccCost;
4346
4347 // getMulAccReductionCost for in-loop reductions does not support
4348 // mixed or floating-point extends.
4349 if (Ext0 && Ext1 &&
4350 (Ext0->getOpcode() != Ext1->getOpcode() ||
4351 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4352 return false;
4353
4354 bool IsZExt =
4355 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4356 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4357 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4358 SrcVecTy, CostKind);
4359
4360 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4361 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4362 InstructionCost ExtCost = 0;
4363 if (Ext0)
4364 ExtCost += Ext0->computeCost(VF, Ctx);
4365 if (Ext1)
4366 ExtCost += Ext1->computeCost(VF, Ctx);
4367 if (OuterExt)
4368 ExtCost += OuterExt->computeCost(VF, Ctx);
4369
4370 return MulAccCost.isValid() &&
4371 MulAccCost < ExtCost + MulCost + RedCost;
4372 },
4373 Range);
4374 };
4375
4376 VPValue *VecOp = Red->getVecOp();
4377 VPRecipeBase *Sub = nullptr;
4378 VPValue *A, *B;
4379 VPValue *Tmp = nullptr;
4380
4381 if (RedTy->isFloatingPointTy())
4382 return nullptr;
4383
4384 // Sub reductions could have a sub between the add reduction and vec op.
4385 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4386 Sub = VecOp->getDefiningRecipe();
4387 VecOp = Tmp;
4388 }
4389
4390 // If ValB is a constant and can be safely extended, truncate it to the same
4391 // type as ExtA's operand, then extend it to the same type as ExtA. This
4392 // creates two uniform extends that can more easily be matched by the rest of
4393 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4394 // replaced with the new extend of the constant.
4395 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4396 VPWidenCastRecipe *&ExtB,
4397 VPValue *&ValB, VPWidenRecipe *Mul) {
4398 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4399 return;
4400 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4401 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4402 const APInt *Const;
4403 if (!match(ValB, m_APInt(Const)) ||
4405 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4406 return;
4407 // The truncate ensures that the type of each extended operand is the
4408 // same, and it's been proven that the constant can be extended from
4409 // NarrowTy safely. Necessary since ExtA's extended operand would be
4410 // e.g. an i8, while the const will likely be an i32. This will be
4411 // elided by later optimisations.
4412 VPBuilder Builder(Mul);
4413 auto *Trunc =
4414 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4415 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4416 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4417 Mul->setOperand(1, ExtB);
4418 };
4419
4420 // Try to match reduce.add(mul(...)).
4421 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4422 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4423 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4424 auto *Mul = cast<VPWidenRecipe>(VecOp);
4425
4426 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4427 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4428
4429 // Match reduce.add/sub(mul(ext, ext)).
4430 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4431 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4432 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4433 if (Sub)
4434 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4435 cast<VPWidenRecipe>(Sub), Red);
4436 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4437 }
4438 // TODO: Add an expression type for this variant with a negated mul
4439 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4440 return new VPExpressionRecipe(Mul, Red);
4441 }
4442 // TODO: Add an expression type for negated versions of other expression
4443 // variants.
4444 if (Sub)
4445 return nullptr;
4446
4447 // Match reduce.add(ext(mul(A, B))).
4448 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4449 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4450 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4451 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(A);
4452 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(B);
4453
4454 // reduce.add(ext(mul(ext, const)))
4455 // -> reduce.add(ext(mul(ext, ext(const))))
4456 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4457
4458 // reduce.add(ext(mul(ext(A), ext(B))))
4459 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4460 // The inner extends must either have the same opcode as the outer extend or
4461 // be the same, in which case the multiply can never result in a negative
4462 // value and the outer extend can be folded away by doing wider
4463 // extends for the operands of the mul.
4464 if (Ext0 && Ext1 &&
4465 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4466 Ext0->getOpcode() == Ext1->getOpcode() &&
4467 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4468 auto *NewExt0 = new VPWidenCastRecipe(
4469 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4470 *Ext0, *Ext0, Ext0->getDebugLoc());
4471 NewExt0->insertBefore(Ext0);
4472
4473 VPWidenCastRecipe *NewExt1 = NewExt0;
4474 if (Ext0 != Ext1) {
4475 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4476 Ext->getResultType(), nullptr, *Ext1,
4477 *Ext1, Ext1->getDebugLoc());
4478 NewExt1->insertBefore(Ext1);
4479 }
4480 Mul->setOperand(0, NewExt0);
4481 Mul->setOperand(1, NewExt1);
4482 Red->setOperand(1, Mul);
4483 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4484 }
4485 }
4486 return nullptr;
4487}
4488
4489/// This function tries to create abstract recipes from the reduction recipe
4490/// for subsequent optimizations and cost estimation.
4492 VPCostContext &Ctx,
4493 VFRange &Range) {
4494 // Creation of VPExpressions for partial reductions is entirely handled in
4495 // transformToPartialReduction.
4496 assert(!Red->isPartialReduction() &&
4497 "This path does not support partial reductions");
4498
4499 VPExpressionRecipe *AbstractR = nullptr;
4500 auto IP = std::next(Red->getIterator());
4501 auto *VPBB = Red->getParent();
4502 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4503 AbstractR = MulAcc;
4504 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4505 AbstractR = ExtRed;
4506 // Cannot create abstract inloop reduction recipes.
4507 if (!AbstractR)
4508 return;
4509
4510 AbstractR->insertBefore(*VPBB, IP);
4511 Red->replaceAllUsesWith(AbstractR);
4512}
4513
4524
4525void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4526 if (Plan.hasScalarVFOnly())
4527 return;
4528
4529#ifndef NDEBUG
4530 VPDominatorTree VPDT(Plan);
4531#endif
4532
4533 SmallVector<VPValue *> VPValues;
4534 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4535 VPValues.push_back(BTC);
4536 append_range(VPValues, Plan.getLiveIns());
4537 for (VPRecipeBase &R : *Plan.getEntry())
4538 append_range(VPValues, R.definedValues());
4539
4540 auto *VectorPreheader = Plan.getVectorPreheader();
4541 for (VPValue *VPV : VPValues) {
4543 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4544 continue;
4545
4546 // Add explicit broadcast at the insert point that dominates all users.
4547 VPBasicBlock *HoistBlock = VectorPreheader;
4548 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4549 for (VPUser *User : VPV->users()) {
4550 if (User->usesScalars(VPV))
4551 continue;
4552 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4553 HoistPoint = HoistBlock->begin();
4554 else
4555 assert(VPDT.dominates(VectorPreheader,
4556 cast<VPRecipeBase>(User)->getParent()) &&
4557 "All users must be in the vector preheader or dominated by it");
4558 }
4559
4560 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4561 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4562 VPV->replaceUsesWithIf(Broadcast,
4563 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4564 return Broadcast != &U && !U.usesScalars(VPV);
4565 });
4566 }
4567}
4568
4569// Collect common metadata from a group of replicate recipes by intersecting
4570// metadata from all recipes in the group.
4572 VPIRMetadata CommonMetadata = *Recipes.front();
4573 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4574 CommonMetadata.intersect(*Recipe);
4575 return CommonMetadata;
4576}
4577
4578template <unsigned Opcode>
4582 const Loop *L) {
4583 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4584 "Only Load and Store opcodes supported");
4585 constexpr bool IsLoad = (Opcode == Instruction::Load);
4586 VPTypeAnalysis TypeInfo(Plan);
4587
4588 // For each address, collect operations with the same or complementary masks.
4590 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4591 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4592 };
4594 Plan, PSE, L,
4595 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4596 for (auto Recipes : Groups) {
4597 if (Recipes.size() < 2)
4598 continue;
4599
4600 // Collect groups with the same or complementary masks.
4601 for (VPReplicateRecipe *&RecipeI : Recipes) {
4602 if (!RecipeI)
4603 continue;
4604
4605 VPValue *MaskI = RecipeI->getMask();
4606 Type *TypeI = GetLoadStoreValueType(RecipeI);
4608 Group.push_back(RecipeI);
4609 RecipeI = nullptr;
4610
4611 // Find all operations with the same or complementary masks.
4612 bool HasComplementaryMask = false;
4613 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4614 if (!RecipeJ)
4615 continue;
4616
4617 VPValue *MaskJ = RecipeJ->getMask();
4618 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4619 if (TypeI == TypeJ) {
4620 // Check if any operation in the group has a complementary mask with
4621 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4622 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4623 match(MaskJ, m_Not(m_Specific(MaskI)));
4624 Group.push_back(RecipeJ);
4625 RecipeJ = nullptr;
4626 }
4627 }
4628
4629 if (HasComplementaryMask) {
4630 assert(Group.size() >= 2 && "must have at least 2 entries");
4631 AllGroups.push_back(std::move(Group));
4632 }
4633 }
4634 }
4635
4636 return AllGroups;
4637}
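// Example of a group with complementary masks (illustrative, source form):
//
//   if (c) v1 = *p; // replicate load with mask  M
//   else   v2 = *p; // replicate load with mask !M
//
// Both recipes access the same address and every lane executes exactly one of
// them, so a single unpredicated access at that address is safe.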
4638
4639// Find the recipe with minimum alignment in the group.
4640template <typename InstType>
4641static VPReplicateRecipe *
4643 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4644 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4645 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4646 });
4647}
4648
4651 const Loop *L) {
4652 auto Groups =
4654 if (Groups.empty())
4655 return;
4656
4657 // Process each group of loads.
4658 for (auto &Group : Groups) {
4659 // Try to use the earliest (most dominating) load to replace all others.
4660 VPReplicateRecipe *EarliestLoad = Group[0];
4661 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4662 VPBasicBlock *LastBB = Group.back()->getParent();
4663
4664 // Check that the load doesn't alias with stores between first and last.
4665 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4666 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4667 continue;
4668
4669 // Collect common metadata from all loads in the group.
4670 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4671
4672 // Find the load with minimum alignment to use.
4673 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4674
4675 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4676 assert(all_of(Group,
4677 [IsSingleScalar](VPReplicateRecipe *R) {
4678 return R->isSingleScalar() == IsSingleScalar;
4679 }) &&
4680 "all members in group must agree on IsSingleScalar");
4681
4682 // Create an unpredicated version of the earliest load with common
4683 // metadata.
4684 auto *UnpredicatedLoad = new VPReplicateRecipe(
4685 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4686 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4687
4688 UnpredicatedLoad->insertBefore(EarliestLoad);
4689
4690 // Replace all loads in the group with the unpredicated load.
4691 for (VPReplicateRecipe *Load : Group) {
4692 Load->replaceAllUsesWith(UnpredicatedLoad);
4693 Load->eraseFromParent();
4694 }
4695 }
4696}
4697
4698static bool
4700 PredicatedScalarEvolution &PSE, const Loop &L,
4701 VPTypeAnalysis &TypeInfo) {
4702 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4703 if (!StoreLoc || !StoreLoc->AATags.Scope)
4704 return false;
4705
4706 // When sinking a group of stores, all members of the group alias each other.
4707 // Skip them during the alias checks.
4708 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4709 StoresToSink.end());
4710
4711 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4712 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4713 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4714 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4715}
4716
4719 const Loop *L) {
4720 auto Groups =
4722 if (Groups.empty())
4723 return;
4724
4725 VPTypeAnalysis TypeInfo(Plan);
4726
4727 for (auto &Group : Groups) {
4728 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4729 continue;
4730
4731 // Use the last (most dominated) store's location for the unconditional
4732 // store.
4733 VPReplicateRecipe *LastStore = Group.back();
4734 VPBasicBlock *InsertBB = LastStore->getParent();
4735
4736 // Collect common alias metadata from all stores in the group.
4737 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4738
4739 // Build select chain for stored values.
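// For example, for a group {store v1 if m, store v2 if !m} to the same
// address, the chain below computes select(!m, v2, v1), which is then stored
// unconditionally at the position of the last store.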
4740 VPValue *SelectedValue = Group[0]->getOperand(0);
4741 VPBuilder Builder(InsertBB, LastStore->getIterator());
4742
4743 bool IsSingleScalar = Group[0]->isSingleScalar();
4744 for (unsigned I = 1; I < Group.size(); ++I) {
4745 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4746 "all members in group must agree on IsSingleScalar");
4747 VPValue *Mask = Group[I]->getMask();
4748 VPValue *Value = Group[I]->getOperand(0);
4749 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4750 Group[I]->getDebugLoc());
4751 }
4752
4753 // Find the store with minimum alignment to use.
4754 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4755
4756 // Create unconditional store with selected value and common metadata.
4757 auto *UnpredicatedStore = new VPReplicateRecipe(
4758 StoreWithMinAlign->getUnderlyingInstr(),
4759 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4760 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4761 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4762
4763 // Remove all predicated stores from the group.
4764 for (VPReplicateRecipe *Store : Group)
4765 Store->eraseFromParent();
4766 }
4767}
4768
4770 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4772 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4773 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4774
4775 VPValue *TC = Plan.getTripCount();
4776 if (TC->getNumUsers() == 0)
4777 return;
4778
4779 // Skip cases for which the trip count may be non-trivial to materialize.
4780 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4781 // tail is required.
4782 if (!Plan.hasScalarTail() ||
4784 Plan.getScalarPreheader() ||
4785 !isa<VPIRValue>(TC))
4786 return;
4787
4788 // Materialize the vector trip count for constant trip counts early, if it
4789 // can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
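// For example, with a constant TC = 17, VF = 4 and UF = 2, VF * UF = 8 and the
// vector trip count folds to (17 / 8) * 8 = 16.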
4790 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4791 // tail-folded loops.
4792 ScalarEvolution &SE = *PSE.getSE();
4793 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4794 if (!isa<SCEVConstant>(TCScev))
4795 return;
4796 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4797 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4798 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4799 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4800}
4801
4803 VPBasicBlock *VectorPH) {
4805 if (BTC->getNumUsers() == 0)
4806 return;
4807
4808 VPBuilder Builder(VectorPH, VectorPH->begin());
4809 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4810 auto *TCMO =
4811 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
4812 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4813 BTC->replaceAllUsesWith(TCMO);
4814}
4815
4817 if (Plan.hasScalarVFOnly())
4818 return;
4819
4820 VPTypeAnalysis TypeInfo(Plan);
4821 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4822 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4824 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4825 vp_depth_first_shallow(LoopRegion->getEntry()));
4826 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
4827 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
4828 // regions. Those are not materialized explicitly yet.
4829 // TODO: materialize build vectors for replicating recipes in replicating
4830 // regions.
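// For example, a single-scalar VPReplicateRecipe whose result is also used by
// a recipe requiring a vector operand gets a BuildVector (or BuildStructVector
// for struct results) inserted right after it, and only the vector users are
// rewired to it.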
4831 for (VPBasicBlock *VPBB :
4832 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4833 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4835 continue;
4836 auto *DefR = cast<VPSingleDefRecipe>(&R);
4837 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4838 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4839 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4840 };
4841 if ((isa<VPReplicateRecipe>(DefR) &&
4842 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4843 (isa<VPInstruction>(DefR) &&
4845 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4846 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4847 continue;
4848
4849 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4850 unsigned Opcode = ScalarTy->isStructTy()
4853 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4854 BuildVector->insertAfter(DefR);
4855
4856 DefR->replaceUsesWithIf(
4857 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4858 VPUser &U, unsigned) {
4859 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4860 });
4861 }
4862 }
4863
4864 // Create explicit VPInstructions to convert vectors to scalars. The current
4865 // implementation is conservative - it may skip some defs that may or may not
4866 // be vector values. TODO: introduce Unpacks speculatively and remove them
4867 // later if they are known to operate on scalar values.
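// For example, a vector value defined by a widened recipe and used by a
// replicating recipe outside a replicate region gets an explicit Unpack
// inserted directly after its definition, and those scalar users are rewired
// to the Unpack.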
4868 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4869 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4871 VPDerivedIVRecipe>(&R))
4872 continue;
4873 for (VPValue *Def : R.definedValues()) {
4874 // Skip recipes that are single-scalar or only have their first lane
4875 // used.
4876 // TODO: The Defs skipped here may or may not be vector values.
4877 // Introduce Unpacks, and remove them later, if they are guaranteed to
4878 // produce scalar values.
4880 continue;
4881
4882 // At the moment, we create unpacks only for scalar users outside
4883 // replicate regions. Recipes inside replicate regions still extract the
4884 // required lanes implicitly.
4885 // TODO: Remove once replicate regions are unrolled completely.
4886 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4887 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4888 return U->usesScalars(Def) &&
4889 (!ParentRegion || !ParentRegion->isReplicator());
4890 };
4891 if (none_of(Def->users(), IsCandidateUnpackUser))
4892 continue;
4893
4894 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4895 if (R.isPhi())
4896 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4897 else
4898 Unpack->insertAfter(&R);
4899 Def->replaceUsesWithIf(Unpack,
4900 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4901 return IsCandidateUnpackUser(&U);
4902 });
4903 }
4904 }
4905 }
4906}
4907
4909 VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking,
4910 bool RequiresScalarEpilogue, VPValue *Step,
4911 std::optional<uint64_t> MaxRuntimeStep) {
4912 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4913 // There's nothing to do if there are no users of the vector trip count or its
4914 // IR value has already been set.
4915 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4916 return;
4917
4918 VPValue *TC = Plan.getTripCount();
4919 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4920 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
4921 if (auto *StepR = Step->getDefiningRecipe()) {
4922 assert(StepR->getParent() == VectorPHVPBB &&
4923 "Step must be defined in VectorPHVPBB");
4924 // Insert after Step's definition to maintain valid def-use ordering.
4925 InsertPt = std::next(StepR->getIterator());
4926 }
4927 VPBuilder Builder(VectorPHVPBB, InsertPt);
4928
4929 // For scalable steps, if TC is a constant and is divisible by the maximum
4930 // possible runtime step, then TC % Step == 0 for all valid vscale values
4931 // and the vector trip count equals TC directly.
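// For example, with VF = vscale x 4, UF = 1 and vscale_range(1, 2), the
// maximum runtime step is 8; a constant TC = 24 is divisible by both possible
// steps (4 and 8), so the vector trip count is TC itself.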
4932 const APInt *TCVal;
4933 if (!RequiresScalarEpilogue && match(TC, m_APInt(TCVal)) && MaxRuntimeStep &&
4934 TCVal->getZExtValue() % *MaxRuntimeStep == 0) {
4935 VectorTC.replaceAllUsesWith(TC);
4936 return;
4937 }
4938
4939 // If the tail is to be folded by masking, round the number of iterations N
4940 // up to a multiple of Step instead of rounding down. This is done by first
4941 // adding Step-1 and then rounding down. Note that it's ok if this addition
4942 // overflows: the vector induction variable will eventually wrap to zero given
4943 // that it starts at zero and its Step is a power of two; the loop will then
4944 // exit, with the last early-exit vector comparison also producing all-true.
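// For example, with TC = 10 and Step = 8, the rounded-up count is 10 + 7 = 17
// and the vector trip count becomes 17 - (17 % 8) = 16, i.e. two masked vector
// iterations that cover all 10 original iterations.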
4945 if (TailByMasking) {
4946 TC = Builder.createAdd(
4947 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
4948 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4949 }
4950
4951 // Now we need to generate the expression for the part of the loop that the
4952 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4953 // iterations are not required for correctness, or N - Step, otherwise. Step
4954 // is equal to the vectorization factor (number of SIMD elements) times the
4955 // unroll factor (number of SIMD instructions).
4956 VPValue *R =
4957 Builder.createNaryOp(Instruction::URem, {TC, Step},
4958 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4959
4960 // There are cases where we *must* run at least one iteration in the remainder
4961 // loop. See the cost model for when this can happen. If the step evenly
4962 // divides the trip count, we set the remainder to be equal to the step. If
4963 // the step does not evenly divide the trip count, no adjustment is necessary
4964 // since there will already be scalar iterations. Note that the minimum
4965 // iterations check ensures that N >= Step.
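// For example, with TC = 16 and Step = 4 the remainder 16 % 4 == 0 is bumped
// to 4, giving a vector trip count of 12 and leaving 4 iterations for the
// required scalar epilogue.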
4966 if (RequiresScalarEpilogue) {
4967 assert(!TailByMasking &&
4968 "requiring scalar epilogue is not supported with tail folding");
4969 VPValue *IsZero =
4970 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
4971 R = Builder.createSelect(IsZero, Step, R);
4972 }
4973
4974 VPValue *Res =
4975 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
4976 VectorTC.replaceAllUsesWith(Res);
4977}
4978
4980 ElementCount VFEC) {
4981 // If VF and VFxUF have already been materialized (no remaining users),
4982 // there's nothing more to do.
4983 if (Plan.getVF().isMaterialized()) {
4984 assert(Plan.getVFxUF().isMaterialized() &&
4985 "VF and VFxUF must be materialized together");
4986 return;
4987 }
4988
4989 VPBuilder Builder(VectorPH, VectorPH->begin());
4990 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4991 VPValue &VF = Plan.getVF();
4992 VPValue &VFxUF = Plan.getVFxUF();
4993 // If there are no users of the runtime VF, compute VFxUF by constant folding
4994 // the multiplication of VF and UF.
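// For example, with a fixed VF = 4 and UF = 2 this folds VFxUF to the constant
// 8; with VF = vscale x 4 it emits the element count vscale x 8.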
4995 if (VF.getNumUsers() == 0) {
4996 VPValue *RuntimeVFxUF =
4997 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
4998 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4999 return;
5000 }
5001
5002 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5003 // vscale) * UF.
5004 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5006 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5008 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5009 }
5010 VF.replaceAllUsesWith(RuntimeVF);
5011
5012 VPValue *MulByUF = Builder.createOverflowingOp(
5013 Instruction::Mul,
5014 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5015 {true, false});
5016 VFxUF.replaceAllUsesWith(MulByUF);
5017}
5018
5021 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5022
5023 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5024 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5025 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5026 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5028 continue;
5029 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5030 if (!ExpSCEV)
5031 break;
5032 const SCEV *Expr = ExpSCEV->getSCEV();
5033 Value *Res =
5034 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5035 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5036 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5037 ExpSCEV->replaceAllUsesWith(Exp);
5038 if (Plan.getTripCount() == ExpSCEV)
5039 Plan.resetTripCount(Exp);
5040 ExpSCEV->eraseFromParent();
5041 }
5043 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5044 "before any VPIRInstructions");
5045 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5046 // to the VPIRBasicBlock.
5047 auto EI = Entry->begin();
5048 for (Instruction &I : drop_end(*EntryBB)) {
5049 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5050 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5051 EI++;
5052 continue;
5053 }
5055 }
5056
5057 return ExpandedSCEVs;
5058}
5059
5060/// Returns true if \p OpV is defined by a VPWidenLoadRecipe or VPInterleaveRecipe
5061/// that can be converted to a narrower recipe. \p OpV is the operand at index
5062/// \p OpIdx of a wide recipe feeding a store interleave group at member index
5063/// \p Idx; \p WideMember0 is the recipe feeding the same group at member index
5064/// 0. A VPWidenLoadRecipe can be narrowed to an index-independent load if it
5065/// feeds all wide ops at all indices (\p OpV must also be the operand at index
5066/// \p OpIdx of \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
5067/// load if \p OpV is defined at \p Idx of a load interleave group.
5068static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5069 VPValue *OpV, unsigned Idx, bool IsScalable) {
5070 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5071 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5072 if (!Member0OpR)
5073 return Member0Op == OpV;
5074 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5075 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5076 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5077 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5078 Member0Op == OpV;
5079 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5080 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5081 return false;
5082}
5083
5084static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5086 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5087 if (!WideMember0)
5088 return false;
5089 for (VPValue *V : Ops) {
5091 return false;
5092 auto *R = cast<VPSingleDefRecipe>(V);
5093 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5094 return false;
5095 }
5096
5097 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5099 for (VPValue *Op : Ops)
5100 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5101
5102 if (canNarrowOps(OpsI, IsScalable))
5103 continue;
5104
5105 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5106 const auto &[OpIdx, OpV] = P;
5107 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5108 }))
5109 return false;
5110 }
5111
5112 return true;
5113}
5114
5115/// Returns VF from \p VFs if \p InterleaveR is a full interleave group with factor and
5116/// number of members both equal to VF. The interleave group must also access
5117/// the full vector width.
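/// For example, a full, unmasked factor-4 group of i32 members matches VF = 4
/// on a target whose vector registers are 128 bits wide, since 4 x 32 bits
/// fills the full register width.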
5118static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5120 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5121 if (!InterleaveR || InterleaveR->getMask())
5122 return std::nullopt;
5123
5124 Type *GroupElementTy = nullptr;
5125 if (InterleaveR->getStoredValues().empty()) {
5126 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5127 if (!all_of(InterleaveR->definedValues(),
5128 [&TypeInfo, GroupElementTy](VPValue *Op) {
5129 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5130 }))
5131 return std::nullopt;
5132 } else {
5133 GroupElementTy =
5134 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5135 if (!all_of(InterleaveR->getStoredValues(),
5136 [&TypeInfo, GroupElementTy](VPValue *Op) {
5137 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5138 }))
5139 return std::nullopt;
5140 }
5141
5142 auto IG = InterleaveR->getInterleaveGroup();
5143 if (IG->getFactor() != IG->getNumMembers())
5144 return std::nullopt;
5145
5146 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5147 TypeSize Size = TTI.getRegisterBitWidth(
5150 assert(Size.isScalable() == VF.isScalable() &&
5151 "if Size is scalable, VF must be scalable and vice versa");
5152 return Size.getKnownMinValue();
5153 };
5154
5155 for (ElementCount VF : VFs) {
5156 unsigned MinVal = VF.getKnownMinValue();
5157 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5158 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5159 return {VF};
5160 }
5161 return std::nullopt;
5162}
5163
5164/// Returns true if \p VPV is a narrow VPValue.
5165static bool isAlreadyNarrow(VPValue *VPV) {
5166 if (isa<VPIRValue>(VPV))
5167 return true;
5168 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5169 return RepR && RepR->isSingleScalar();
5170}
5171
5172// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5173// a narrow variant.
5174static VPValue *
5176 auto *R = V->getDefiningRecipe();
5177 if (!R || NarrowedOps.contains(V))
5178 return V;
5179
5180 if (isAlreadyNarrow(V))
5181 return V;
5182
5184 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5185 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5186 WideMember0->setOperand(
5187 Idx,
5188 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5189 return V;
5190 }
5191
5192 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5193 // Narrow interleave group to wide load, as transformed VPlan will only
5194 // process one original iteration.
5195 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5196 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5197 LoadGroup->getMask(), /*Consecutive=*/true,
5198 {}, LoadGroup->getDebugLoc());
5199 L->insertBefore(LoadGroup);
5200 NarrowedOps.insert(L);
5201 return L;
5202 }
5203
5204 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5205 assert(RepR->isSingleScalar() && RepR->getOpcode() == Instruction::Load &&
5206 "must be a single scalar load");
5207 NarrowedOps.insert(RepR);
5208 return RepR;
5209 }
5210
5211 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5212 VPValue *PtrOp = WideLoad->getAddr();
5213 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5214 PtrOp = VecPtr->getOperand(0);
5215 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5216 // process one original iteration.
5217 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5218 /*IsUniform*/ true,
5219 /*Mask*/ nullptr, {}, *WideLoad);
5220 N->insertBefore(WideLoad);
5221 NarrowedOps.insert(N);
5222 return N;
5223}
5224
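// Illustrative example: with VF = 2 and factor-2 interleave groups, a loop
// such as
//   for (i = 0; i < n; i++) { dst[2*i] = src[2*i]; dst[2*i+1] = src[2*i+1]; }
// accesses one full <2 x ty> vector per original iteration, so the interleave
// groups can be replaced by plain consecutive wide loads/stores and the
// canonical IV can step by one original iteration (times UF) instead of VF.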
5225std::unique_ptr<VPlan>
5227 const TargetTransformInfo &TTI) {
5228 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5229
5230 if (!VectorLoop)
5231 return nullptr;
5232
5233 // Only handle single-block loops for now.
5234 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5235 return nullptr;
5236
5237 // Skip plans when we may not be able to properly narrow.
5238 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5239 if (!match(&Exiting->back(), m_BranchOnCount()))
5240 return nullptr;
5241
5242 assert(match(&Exiting->back(),
5244 m_Specific(&Plan.getVectorTripCount()))) &&
5245 "unexpected branch-on-count");
5246
5247 VPTypeAnalysis TypeInfo(Plan);
5249 std::optional<ElementCount> VFToOptimize;
5250 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5253 continue;
5254
5255 // Bail out on recipes not supported at the moment:
5256 // * phi recipes other than the canonical induction
5257 // * recipes writing to memory except interleave groups
5258 // Only support plans with a canonical induction phi.
5259 if (R.isPhi())
5260 return nullptr;
5261
5262 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5263 if (R.mayWriteToMemory() && !InterleaveR)
5264 return nullptr;
5265
5266 // Bail out if any recipe defines a vector value used outside the
5267 // vector loop region.
5268 if (any_of(R.definedValues(), [&](VPValue *V) {
5269 return any_of(V->users(), [&](VPUser *U) {
5270 auto *UR = cast<VPRecipeBase>(U);
5271 return UR->getParent()->getParent() != VectorLoop;
5272 });
5273 }))
5274 return nullptr;
5275
5276 // All other ops are allowed, but we reject uses that cannot be converted
5277 // when checking all allowed consumers (store interleave groups) below.
5278 if (!InterleaveR)
5279 continue;
5280
5281 // Try to find a single VF, where all interleave groups are consecutive and
5282 // saturate the full vector width. If we already have a candidate VF, check
5283 // if it is applicable for the current InterleaveR, otherwise look for a
5284 // suitable VF across the Plan's VFs.
5286 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5287 : to_vector(Plan.vectorFactors());
5288 std::optional<ElementCount> NarrowedVF =
5289 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5290 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5291 return nullptr;
5292 VFToOptimize = NarrowedVF;
5293
5294 // Skip read interleave groups.
5295 if (InterleaveR->getStoredValues().empty())
5296 continue;
5297
5298 // Narrow interleave groups, if all operands are already matching narrow
5299 // ops.
5300 auto *Member0 = InterleaveR->getStoredValues()[0];
5301 if (isAlreadyNarrow(Member0) &&
5302 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5303 StoreGroups.push_back(InterleaveR);
5304 continue;
5305 }
5306
5307 // For now, we only support full store interleave groups whose stored values
5308 // are the matching members of full load interleave groups.
5309 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5310 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5311 if (!DefR)
5312 return false;
5313 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5314 return IR && IR->getInterleaveGroup()->isFull() &&
5315 IR->getVPValue(Op.index()) == Op.value();
5316 })) {
5317 StoreGroups.push_back(InterleaveR);
5318 continue;
5319 }
5320
5321 // Check if all values feeding InterleaveR are matching wide recipes whose
5322 // operands can be narrowed.
5323 if (!canNarrowOps(InterleaveR->getStoredValues(),
5324 VFToOptimize->isScalable()))
5325 return nullptr;
5326 StoreGroups.push_back(InterleaveR);
5327 }
5328
5329 if (StoreGroups.empty())
5330 return nullptr;
5331
5332 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5333 bool RequiresScalarEpilogue =
5334 MiddleVPBB->getNumSuccessors() == 1 &&
5335 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5336 // Bail out for tail-folding (middle block with a single successor to exit).
5337 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5338 return nullptr;
5339
5340 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5341 // original Plan into two: a) a new clone which contains all VFs of Plan except
5342 // VFToOptimize, and b) the original Plan with VFToOptimize as its single VF.
5343 // TODO: Handle cases where only some interleave groups can be narrowed.
5344 std::unique_ptr<VPlan> NewPlan;
5345 if (size(Plan.vectorFactors()) != 1) {
5346 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5347 Plan.setVF(*VFToOptimize);
5348 NewPlan->removeVF(*VFToOptimize);
5349 }
5350
5351 // Convert each store interleave group to a single VPWidenStoreRecipe.
5352 SmallPtrSet<VPValue *, 4> NarrowedOps;
5353 // Narrow operation tree rooted at store groups.
5354 for (auto *StoreGroup : StoreGroups) {
5355 VPValue *Res =
5356 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5357 auto *SI =
5358 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5359 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5360 /*Consecutive=*/true, {},
5361 StoreGroup->getDebugLoc());
5362 S->insertBefore(StoreGroup);
5363 StoreGroup->eraseFromParent();
5364 }
5365
5366 // Adjust induction to reflect that the transformed plan only processes one
5367 // original iteration.
5369 Type *CanIVTy = VectorLoop->getCanonicalIVType();
5370 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5371 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5372
5373 VPValue *UF = &Plan.getUF();
5374 VPValue *Step;
5375 if (VFToOptimize->isScalable()) {
5376 VPValue *VScale =
5377 PHBuilder.createElementCount(CanIVTy, ElementCount::getScalable(1));
5378 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5379 {true, false});
5380 Plan.getVF().replaceAllUsesWith(VScale);
5381 } else {
5382 Step = UF;
5383 Plan.getVF().replaceAllUsesWith(Plan.getConstantInt(CanIVTy, 1));
5384 }
5385 // Materialize vector trip count with the narrowed step.
5386 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5387 RequiresScalarEpilogue, Step);
5388
5389 CanIVInc->setOperand(1, Step);
5390 Plan.getVFxUF().replaceAllUsesWith(Step);
5391
5392 removeDeadRecipes(Plan);
5393 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5395 "All VPVectorPointerRecipes should have been removed");
5396 return NewPlan;
5397}
5398
5399/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5400/// BranchOnCond recipe.
5402 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5403 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5404 auto *MiddleTerm =
5406 // Only add branch metadata if there is a (conditional) terminator.
5407 if (!MiddleTerm)
5408 return;
5409
5410 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5411 "must have a BranchOnCond");
5412 // Assume that `TripCount % VectorStep` is uniformly distributed.
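// For example, with VF = 4 and UF = 2 the vector step is 8, so the branch
// weights are {1, 7}: each of the 8 possible remainder values is assumed to be
// equally likely.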
5413 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5414 if (VF.isScalable() && VScaleForTuning.has_value())
5415 VectorStep *= *VScaleForTuning;
5416 assert(VectorStep > 0 && "trip count should not be zero");
5417 MDBuilder MDB(Plan.getContext());
5418 MDNode *BranchWeights =
5419 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5420 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5421}
5422
5424 VFRange &Range) {
5425 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5426 auto *MiddleVPBB = Plan.getMiddleBlock();
5427 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5428 VPTypeAnalysis TypeInfo(Plan);
5429
5430 auto IsScalableOne = [](ElementCount VF) -> bool {
5431 return VF == ElementCount::getScalable(1);
5432 };
5433
5434 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5435 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5436 if (!FOR)
5437 continue;
5438
5439 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5440 "Cannot handle loops with uncountable early exits");
5441
5442 // Find the existing splice for this FOR, created in
5443 // createHeaderPhiRecipes. All uses of FOR have already been replaced with
5444 // RecurSplice there; only RecurSplice itself still references FOR.
5445 auto *RecurSplice =
5447 assert(RecurSplice && "expected FirstOrderRecurrenceSplice");
5448
5449 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5450 // penultimate value of the recurrence. Instead we rely on the existing
5451 // extract of the last element from the result of
5452 // VPInstruction::FirstOrderRecurrenceSplice.
5453 // TODO: Consider vscale_range info and UF.
5454 if (any_of(RecurSplice->users(),
5455 [](VPUser *U) { return !cast<VPRecipeBase>(U)->getRegion(); }) &&
5457 Range))
5458 return;
5459
5460 // This is the second phase of vectorizing first-order recurrences, creating
5461 // extracts for users outside the loop. An overview of the transformation is
5462 // described below. Suppose we have the following loop with some use after
5463 // the loop of the last a[i-1],
5464 //
5465 // for (int i = 0; i < n; ++i) {
5466 // t = a[i - 1];
5467 // b[i] = a[i] - t;
5468 // }
5469 // use t;
5470 //
5471 // There is a first-order recurrence on "a". For this loop, the shorthand
5472 // scalar IR looks like:
5473 //
5474 // scalar.ph:
5475 // s.init = a[-1]
5476 // br scalar.body
5477 //
5478 // scalar.body:
5479 // i = phi [0, scalar.ph], [i+1, scalar.body]
5480 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5481 // s2 = a[i]
5482 // b[i] = s2 - s1
5483 // br cond, scalar.body, exit.block
5484 //
5485 // exit.block:
5486 // use = lcssa.phi [s1, scalar.body]
5487 //
5488 // In this example, s1 is a recurrence because its value depends on the
5489 // previous iteration. In the first phase of vectorization, we created a
5490 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5491 // for users in the scalar preheader and exit block.
5492 //
5493 // vector.ph:
5494 // v_init = vector(..., ..., ..., a[-1])
5495 // br vector.body
5496 //
5497 // vector.body
5498 // i = phi [0, vector.ph], [i+4, vector.body]
5499 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5500 // v2 = a[i, i+1, i+2, i+3]
5501 // v1' = splice(v1(3), v2(0, 1, 2))
5502 // b[i, i+1, i+2, i+3] = v2 - v1'
5503 // br cond, vector.body, middle.block
5504 //
5505 // middle.block:
5506 // vector.recur.extract.for.phi = v2(2)
5507 // vector.recur.extract = v2(3)
5508 // br cond, scalar.ph, exit.block
5509 //
5510 // scalar.ph:
5511 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5512 // [s.init, otherwise]
5513 // br scalar.body
5514 //
5515 // scalar.body:
5516 // i = phi [0, scalar.ph], [i+1, scalar.body]
5517 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5518 // s2 = a[i]
5519 // b[i] = s2 - s1
5520 // br cond, scalar.body, exit.block
5521 //
5522 // exit.block:
5523 // lo = lcssa.phi [s1, scalar.body],
5524 // [vector.recur.extract.for.phi, middle.block]
5525 //
5526 // Update extracts of the splice in the middle block: they extract the
5527 // penultimate element of the recurrence.
5529 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5530 if (!match(&R, m_ExtractLastLaneOfLastPart(m_Specific(RecurSplice))))
5531 continue;
5532
5533 auto *ExtractR = cast<VPInstruction>(&R);
5534 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5535 VPInstruction::ExtractPenultimateElement, RecurSplice->getOperand(1),
5536 {}, "vector.recur.extract.for.phi");
5537 for (VPUser *ExitU : to_vector(ExtractR->users())) {
5538 if (auto *ExitPhi = dyn_cast<VPIRPhi>(ExitU))
5539 ExitPhi->replaceUsesOfWith(ExtractR, PenultimateElement);
5540 }
5541 }
5542 }
5543}
5544
5545/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5546/// value. Returns the widened IV if found, nullptr otherwise.
5548 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5549 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5550 Instruction::isIntDivRem(BinOp->getOpcode()))
5551 return nullptr;
5552
5553 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5554 VPValue *InvariantCandidate = BinOp->getOperand(1);
5555 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5556 std::swap(WidenIVCandidate, InvariantCandidate);
5557
5558 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5559 return nullptr;
5560
5561 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5562}
5563
5564/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5565/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
5569 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5570 auto *ClonedOp = BinOp->clone();
5571 if (ClonedOp->getOperand(0) == WidenIV) {
5572 ClonedOp->setOperand(0, ScalarIV);
5573 } else {
5574 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5575 ClonedOp->setOperand(1, ScalarIV);
5576 }
5577 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5578 return ClonedOp;
5579}
5580
5583 Loop &L) {
5584 ScalarEvolution &SE = *PSE.getSE();
5585 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
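// Illustrative example: for a loop like
//   for (i = 0; i < n; i++)
//     if (a[i] > 42) last = i;
// the reduction keeps the IV value of the last iteration where the condition
// held. If a sentinel outside the IV's range exists (e.g. INT_MIN for a
// positive step), inactive lanes keep the sentinel, the vector loop reduces
// with SMax/UMax (SMin/UMin for a negative step), and the middle block
// compares against the sentinel to fall back to the start value when the
// condition never held; otherwise a boolean AnyOf reduction tracks that.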
5586
5587 // Helper lambda to check if the IV range excludes the sentinel value. Try
5588 // signed first, then unsigned. Return an excluded sentinel if found,
5589 // otherwise return std::nullopt.
5590 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5591 bool UseMax) -> std::optional<APSInt> {
5592 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5593 for (bool Signed : {true, false}) {
5594 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5595 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5596
5597 ConstantRange IVRange =
5598 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5599 if (!IVRange.contains(Sentinel))
5600 return Sentinel;
5601 }
5602 return std::nullopt;
5603 };
5604
5605 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5606 for (VPRecipeBase &Phi :
5607 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5608 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5610 PhiR->getRecurrenceKind()))
5611 continue;
5612
5613 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5614 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5615 continue;
5616
5617 // If there's a header mask, the backedge select will not be the find-last
5618 // select.
5619 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5620 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5621 if (HeaderMask &&
5622 !match(BackedgeVal,
5623 m_Select(m_Specific(HeaderMask),
5624 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5625 llvm_unreachable("expected header mask select");
5626
5627 // Get the find-last expression from the find-last select of the reduction
5628 // phi. The find-last select should be a select between the phi and the
5629 // find-last expression.
5630 VPValue *Cond, *FindLastExpression;
5631 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5632 m_VPValue(FindLastExpression))) &&
5633 !match(FindLastSelect,
5634 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5635 m_Specific(PhiR))))
5636 continue;
5637
5638 // Check if FindLastExpression is a simple expression of a widened IV. If
5639 // so, we can track the underlying IV instead and sink the expression.
5640 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5641 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5642 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5643 &L);
5644 const SCEV *Step;
5645 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5646 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5648 "IVOfExpressionToSink not being an AddRec must imply "
5649 "FindLastExpression not being an AddRec.");
5650 continue;
5651 }
5652
5653 // Determine direction from SCEV step.
5654 if (!SE.isKnownNonZero(Step))
5655 continue;
5656
5657 // Positive step means we need UMax/SMax to find the last IV value, and
5658 // UMin/SMin otherwise.
5659 bool UseMax = SE.isKnownPositive(Step);
5660 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5661 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5662
5663 // Sinking an expression will disable epilogue vectorization. Only use it
5664 // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5665 // also prevent vectorizing using a sentinel (e.g., if the expression is a
5666 // multiply or divide by a large constant, respectively), which also makes
5667 // sinking undesirable.
5668 if (IVOfExpressionToSink) {
5669 const SCEV *FindLastExpressionSCEV =
5670 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
5671 if (match(FindLastExpressionSCEV,
5672 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5673 bool NewUseMax = SE.isKnownPositive(Step);
5674 if (auto NewSentinel =
5675 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5676 // The original expression already has a sentinel, so prefer not
5677 // sinking to keep epilogue vectorization possible.
5678 SentinelVal = *NewSentinel;
5679 UseSigned = NewSentinel->isSigned();
5680 UseMax = NewUseMax;
5681 IVSCEV = FindLastExpressionSCEV;
5682 IVOfExpressionToSink = nullptr;
5683 }
5684 }
5685 }
5686
5687 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5688 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5689 // cannot use min/max.
5690 if (!SentinelVal) {
5691 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5692 if (AR->hasNoSignedWrap())
5693 UseSigned = true;
5694 else if (AR->hasNoUnsignedWrap())
5695 UseSigned = false;
5696 else
5697 continue;
5698 }
5699
5701 BackedgeVal,
5703
5704 VPValue *NewFindLastSelect = BackedgeVal;
5705 VPValue *SelectCond = Cond;
5706 if (!SentinelVal || IVOfExpressionToSink) {
5707 // When we need to create a new select, normalize the condition so that
5708 // PhiR is the last operand and include the header mask if needed.
5709 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5710 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
5711 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5712 SelectCond = LoopBuilder.createNot(SelectCond);
5713
5714 // When tail folding, mask the condition with the header mask to prevent
5715 // propagating poison from inactive lanes in the last vector iteration.
5716 if (HeaderMask)
5717 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5718
5719 if (SelectCond != Cond || IVOfExpressionToSink) {
5720 NewFindLastSelect = LoopBuilder.createSelect(
5721 SelectCond,
5722 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5723 PhiR, DL);
5724 }
5725 }
5726
5727 // Create the reduction result in the middle block using sentinel directly.
5728 RecurKind MinMaxKind =
5729 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5730 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5731 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5732 FastMathFlags());
5733 DebugLoc ExitDL = RdxResult->getDebugLoc();
5734 VPBuilder MiddleBuilder(RdxResult);
5735 VPValue *ReducedIV =
5737 NewFindLastSelect, Flags, ExitDL);
5738
5739 // If IVOfExpressionToSink is an expression to sink, sink it now.
5740 VPValue *VectorRegionExitingVal = ReducedIV;
5741 if (IVOfExpressionToSink)
5742 VectorRegionExitingVal =
5743 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5744 ReducedIV, IVOfExpressionToSink);
5745
5746 VPValue *NewRdxResult;
5747 VPValue *StartVPV = PhiR->getStartValue();
5748 if (SentinelVal) {
5749 // Sentinel-based approach: reduce IVs with min/max, compare against
5750 // sentinel to detect if condition was ever true, select accordingly.
5751 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5752 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5753 Sentinel, ExitDL);
5754 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5755 StartVPV, ExitDL);
5756 StartVPV = Sentinel;
5757 } else {
5758 // Introduce a boolean AnyOf reduction to track if the condition was ever
5759 // true in the loop. Use it to select the initial start value, if it was
5760 // never true.
5761 auto *AnyOfPhi = new VPReductionPHIRecipe(
5762 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5763 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5764 AnyOfPhi->insertAfter(PhiR);
5765
5766 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5767 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
5768 AnyOfPhi->setOperand(1, OrVal);
5769
5770 NewRdxResult = MiddleBuilder.createAnyOfReduction(
5771 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
5772
5773 // Initialize the IV reduction phi with the neutral element, not the
5774 // original start value, to ensure correct min/max reduction results.
5775 StartVPV = Plan.getOrAddLiveIn(
5776 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
5777 }
5778 RdxResult->replaceAllUsesWith(NewRdxResult);
5779 RdxResult->eraseFromParent();
5780
5781 auto *NewPhiR = new VPReductionPHIRecipe(
5782 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
5783 *NewFindLastSelect, RdxUnordered{1}, {},
5784 PhiR->hasUsesOutsideReductionChain());
5785 NewPhiR->insertBefore(PhiR);
5786 PhiR->replaceAllUsesWith(NewPhiR);
5787 PhiR->eraseFromParent();
5788 }
5789}
5790
5791namespace {
5792
5793using ExtendKind = TTI::PartialReductionExtendKind;
5794struct ReductionExtend {
5795 Type *SrcType = nullptr;
5796 ExtendKind Kind = ExtendKind::PR_None;
5797};
5798
5799/// Describes the extends used to compute the extended reduction operand.
5800/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
5801/// operation.
5802struct ExtendedReductionOperand {
5803 /// The recipe that consumes the extends.
5804 VPWidenRecipe *ExtendsUser = nullptr;
5805 /// Extend descriptions (inputs to getPartialReductionCost).
5806 ReductionExtend ExtendA, ExtendB;
5807};
5808
5809/// A chain of recipes that form a partial reduction. Matches either
5810/// reduction_bin_op (extended op, accumulator), or
5811/// reduction_bin_op (accumulator, extended op).
5812/// The possible forms of the "extended op" are listed in
5813/// matchExtendedReductionOperand.
5814struct VPPartialReductionChain {
5815 /// The top-level binary operation that forms the reduction to a scalar
5816 /// after the loop body.
5817 VPWidenRecipe *ReductionBinOp = nullptr;
5818 /// The user of the extends that is then reduced.
5819 ExtendedReductionOperand ExtendedOp;
5820 /// The recurrence kind for the entire partial reduction chain.
5821 /// This allows distinguishing between Sub and AddWithSub recurrences,
5822 /// when the ReductionBinOp is an Instruction::Sub.
5823 RecurKind RK;
5824 /// The index of the accumulator operand of ReductionBinOp. The extended op
5825 /// is `1 - AccumulatorOpIdx`.
5826 unsigned AccumulatorOpIdx;
5827 unsigned ScaleFactor;
5828};
5829
5830static VPSingleDefRecipe *
5831optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
5832 VPTypeAnalysis &TypeInfo) {
5833 // reduce.add(mul(ext(A), C))
5834 // -> reduce.add(mul(ext(A), ext(trunc(C))))
5835 const APInt *Const;
5836 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
5837 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
5838 Instruction::CastOps ExtOpc = ExtA->getOpcode();
5839 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
5840 if (!Op->hasOneUse() ||
5842 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
5843 return Op;
5844
5845 VPBuilder Builder(Op);
5846 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
5847 Op->getOperand(1), NarrowTy);
5848 Type *WideTy = TypeInfo.inferScalarType(ExtA);
5849 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
5850 return Op;
5851 }
5852
5853 // reduce.add(abs(sub(ext(A), ext(B))))
5854 // -> reduce.add(ext(absolute-difference(A, B)))
5855 VPValue *X, *Y;
5858 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
5859 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
5860 assert(Ext->getOpcode() ==
5861 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
5862 "Expected both the LHS and RHS extends to be the same");
5863 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
5864 VPBuilder Builder(Op);
5865 Type *SrcTy = TypeInfo.inferScalarType(X);
5866 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
5867 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
5868 auto *Max = Builder.insert(
5869 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
5870 {FreezeX, FreezeY}, SrcTy));
5871 auto *Min = Builder.insert(
5872 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
5873 {FreezeX, FreezeY}, SrcTy));
5874 auto *AbsDiff =
5875 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
5876 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
5877 TypeInfo.inferScalarType(Op));
5878 }
5879
5880 // reduce.add(ext(mul(ext(A), ext(B))))
5881 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
5882 // TODO: Support this optimization for float types.
5884 m_ZExtOrSExt(m_VPValue()))))) {
5885 auto *Ext = cast<VPWidenCastRecipe>(Op);
5886 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
5887 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5888 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5889 if (!Mul->hasOneUse() ||
5890 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
5891 MulLHS->getOpcode() != MulRHS->getOpcode())
5892 return Op;
5893 VPBuilder Builder(Mul);
5894 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
5895 MulLHS->getOperand(0),
5896 Ext->getResultType()));
5897 Mul->setOperand(1, MulLHS == MulRHS
5898 ? Mul->getOperand(0)
5899 : Builder.createWidenCast(MulRHS->getOpcode(),
5900 MulRHS->getOperand(0),
5901 Ext->getResultType()));
5902 return Mul;
5903 }
5904
5905 return Op;
5906}
5907
5908static VPExpressionRecipe *
5909createPartialReductionExpression(VPReductionRecipe *Red) {
5910 VPValue *VecOp = Red->getVecOp();
5911
5912 // reduce.[f]add(ext(op))
5913 // -> VPExpressionRecipe(op, red)
5914 if (match(VecOp, m_WidenAnyExtend(m_VPValue())))
5915 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
5916
5917 // reduce.[f]add([f]mul(ext(a), ext(b)))
5918 // -> VPExpressionRecipe(a, b, mul, red)
5919 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue()))) ||
5920 match(VecOp,
5922 auto *Mul = cast<VPWidenRecipe>(VecOp);
5923 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5924 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5925 return new VPExpressionRecipe(ExtA, ExtB, Mul, Red);
5926 }
5927
5928 // reduce.add(neg(mul(ext(a), ext(b))))
5929 // -> VPExpressionRecipe(a, b, mul, sub, red)
5931 m_ZExtOrSExt(m_VPValue()))))) {
5932 auto *Sub = cast<VPWidenRecipe>(VecOp);
5933 auto *Mul = cast<VPWidenRecipe>(Sub->getOperand(1));
5934 auto *ExtA = cast<VPWidenCastRecipe>(Mul->getOperand(0));
5935 auto *ExtB = cast<VPWidenCastRecipe>(Mul->getOperand(1));
5936 return new VPExpressionRecipe(ExtA, ExtB, Mul, Sub, Red);
5937 }
5938
5939 llvm_unreachable("Unsupported expression");
5940}
5941
5942// Helper to transform a partial reduction chain into a partial reduction
5943// recipe. Assumes profitability has been checked.
5944static void transformToPartialReduction(const VPPartialReductionChain &Chain,
5945 VPTypeAnalysis &TypeInfo, VPlan &Plan,
5946 VPReductionPHIRecipe *RdxPhi) {
5947 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
5948 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
5949
5950 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
5951 auto *ExtendedOp = cast<VPSingleDefRecipe>(
5952 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
5953
5954 // Sub-reductions can be implemented in two ways:
5955 // (1) negate the operand in the vector loop (the default way).
5956 // (2) subtract the reduced value from the init value in the middle block.
5957 // Both ways keep the reduction itself as an 'add' reduction.
5958 //
5959 // The ISD nodes for partial reductions don't support folding the
5960 // sub/negation into its operands because the following is not a valid
5961 // transformation:
5962 // sub(0, mul(ext(a), ext(b)))
5963 // -> mul(ext(a), ext(sub(0, b)))
5964 //
5965 // It's therefore better to choose option (2) such that the partial
5966 // reduction is always positive (starting at '0') and to do a final
5967 // subtract in the middle block.
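// For example (illustrative), for a loop computing acc -= a[i] * b[i] with
// start value S, option (2) accumulates the products with an add reduction
// starting at 0 and emits S - <reduced sum> in the middle block.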
5968 if (WidenRecipe->getOpcode() == Instruction::Sub &&
5969 Chain.RK != RecurKind::Sub) {
5970 VPBuilder Builder(WidenRecipe);
5971 Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
5972 auto *Zero = Plan.getZero(ElemTy);
5973 auto *NegRecipe =
5974 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
5976 Builder.insert(NegRecipe);
5977 ExtendedOp = NegRecipe;
5978 }
5979
5980 assert((Chain.RK != RecurKind::FAddChainWithSubs) &&
5981 "FSub chain reduction isn't supported");
5982
5983 // FIXME: Do these transforms before invoking the cost-model.
5984 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);
5985
5986 // Check if WidenRecipe is the final result of the reduction. If so look
5987 // through selects for predicated reductions.
5988 VPValue *Cond = nullptr;
5990 WidenRecipe,
5991 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
5992 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
5993 RdxPhi->getBackedgeValue() == ExitValue;
5994 assert((!ExitValue || IsLastInChain) &&
5995 "if we found ExitValue, it must match RdxPhi's backedge value");
5996
5997 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
5998 RecurKind RdxKind =
6000 auto *PartialRed = new VPReductionRecipe(
6001 RdxKind,
6002 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6003 : FastMathFlags(),
6004 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6005 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6006 PartialRed->insertBefore(WidenRecipe);
6007
6008 if (Cond)
6009 ExitValue->replaceAllUsesWith(PartialRed);
6010 WidenRecipe->replaceAllUsesWith(PartialRed);
6011
6012 // For cost-model purposes, fold this into a VPExpression.
6013 VPExpressionRecipe *E = createPartialReductionExpression(PartialRed);
6014 E->insertBefore(WidenRecipe);
6015 PartialRed->replaceAllUsesWith(E);
6016
6017 // We only need to update the PHI node once, which is when we find the
6018 // last reduction in the chain.
6019 if (!IsLastInChain)
6020 return;
6021
6022 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6023 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6024 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6025
6026 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6027 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6028 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6029 StartInst->setOperand(2, NewScaleFactor);
6030
6031 // If this is the last value in a sub-reduction chain, then update the PHI
6032 // node to start at `0` and update the reduction-result to subtract from
6033 // the PHI's start value.
6034 if (Chain.RK != RecurKind::Sub && Chain.RK != RecurKind::FSub)
6035 return;
6036
6037 VPValue *OldStartValue = StartInst->getOperand(0);
6038 StartInst->setOperand(0, StartInst->getOperand(1));
6039
6040 // Replace reduction_result by 'sub (startval, reductionresult)'.
6042 assert(RdxResult && "Could not find reduction result");
6043
6044 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6045 unsigned SubOpc = Chain.RK == RecurKind::FSub ? Instruction::BinaryOps::FSub
6046 : Instruction::BinaryOps::Sub;
6047 VPInstruction *NewResult = Builder.createNaryOp(
6048 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6049 RdxPhi->getDebugLoc());
6050 RdxResult->replaceUsesWithIf(
6051 NewResult,
6052 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6053}
6054
6055/// Returns the cost of a link in a partial-reduction chain for a given VF.
6056static InstructionCost
6057getPartialReductionLinkCost(VPCostContext &CostCtx,
6058 const VPPartialReductionChain &Link,
6059 ElementCount VF) {
6060 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6061 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6062 std::optional<unsigned> BinOpc = std::nullopt;
6063 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6064 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6065 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6066
6067 std::optional<llvm::FastMathFlags> Flags;
6068 if (RdxType->isFloatingPointTy())
6069 Flags = Link.ReductionBinOp->getFastMathFlags();
6070
6071 auto GetLinkOpcode = [&Link]() -> unsigned {
6072 switch (Link.RK) {
6073 case RecurKind::Sub:
6074 return Instruction::Add;
6075 case RecurKind::FSub:
6076 return Instruction::FAdd;
6077 default:
6078 return Link.ReductionBinOp->getOpcode();
6079 }
6080 };
6081
6082 return CostCtx.TTI.getPartialReductionCost(
6083 GetLinkOpcode(), ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType,
6084 RdxType, VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6085 CostCtx.CostKind, Flags);
6086}
6087
6088static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6090}
6091
6092/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6093/// operand. This is an operand where the source of the value (e.g. a load) has
6094/// been extended (sext, zext, or fpext) before it is used in the reduction.
6095///
6096/// Possible forms matched by this function:
6097/// - UpdateR(PrevValue, ext(...))
6098/// - UpdateR(PrevValue, mul(ext(...), ext(...)))
6099/// - UpdateR(PrevValue, mul(ext(...), Constant))
6100/// - UpdateR(PrevValue, neg(mul(ext(...), ext(...))))
6101/// - UpdateR(PrevValue, neg(mul(ext(...), Constant)))
6102/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6103/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6104/// - UpdateR(PrevValue, abs(sub(ext(...), ext(...)))
6105///
6106/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6107static std::optional<ExtendedReductionOperand>
6108matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6109 VPTypeAnalysis &TypeInfo) {
6110 assert(is_contained(UpdateR->operands(), Op) &&
6111 "Op should be operand of UpdateR");
6112
6113 // Try matching an absolute difference operand of the form
6114 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6115 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6116 // difference on a wider type and get the extend for "free" from the partial
6117 // reduction.
6118 VPValue *X, *Y;
6119 if (Op->hasOneUse() &&
6123 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6124 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6125 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6126 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6127 Type *LHSInputType = TypeInfo.inferScalarType(X);
6128 Type *RHSInputType = TypeInfo.inferScalarType(Y);
6129 if (LHSInputType != RHSInputType ||
6130 LHSExt->getOpcode() != RHSExt->getOpcode())
6131 return std::nullopt;
6132 // Note: This is essentially the same as matching ext(...) as we will
6133 // rewrite this operand to ext(absolute-difference(A, B)).
6134 return ExtendedReductionOperand{
6135 Sub,
6136 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6137 /*ExtendB=*/{}};
6138 }
6139
6140 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6142 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6143 VPValue *CastSource = CastRecipe->getOperand(0);
6144 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6145 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6146 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6147 // Match: ext(mul(...))
6148 // Record the outer extend kind and set `Op` to the mul. We can then match
6149 // this as a binary operation. Note: We can optimize out the outer extend
6150 // by widening the inner extends to match it. See
6151 // optimizeExtendsForPartialReduction.
6152 Op = CastSource;
6153 // FIXME: createPartialReductionExpression can't handle sub(ext(mul(...)))
6154 if (UpdateR->getOpcode() == Instruction::Sub)
6155 return std::nullopt;
6156 } else if (UpdateR->getOpcode() == Instruction::Add ||
6157 UpdateR->getOpcode() == Instruction::FAdd) {
6158 // Match: UpdateR(PrevValue, ext(...))
6159 // TODO: Remove the add/fadd restriction (we should be able to handle this
6160 // case for sub reductions too).
6161 return ExtendedReductionOperand{
6162 UpdateR,
6163 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6164 /*ExtendB=*/{}};
6165 }
6166 }
6167
6168 if (!Op->hasOneUse())
6169 return std::nullopt;
6170
6172 if (!MulOp ||
6173 !is_contained({Instruction::Mul, Instruction::FMul}, MulOp->getOpcode()))
6174 return std::nullopt;
6175
6176 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6177 // binary operation.
6178
6179 VPValue *LHS = MulOp->getOperand(0);
6180 VPValue *RHS = MulOp->getOperand(1);
6181
6182 // The LHS of the operation must always be an extend.
6184 return std::nullopt;
6185
6186 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6187 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6188 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6189
6190 // The RHS of the operation can be an extend or a constant integer.
6191 const APInt *RHSConst = nullptr;
6192 VPWidenCastRecipe *RHSCast = nullptr;
6194 RHSCast = cast<VPWidenCastRecipe>(RHS);
6195 else if (!match(RHS, m_APInt(RHSConst)) ||
6196 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6197 return std::nullopt;
6198
6199 // The outer extend kind must match the inner extends for folding.
6200 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6201 if (Cast && OuterExtKind &&
6202 getPartialReductionExtendKind(Cast) != OuterExtKind)
6203 return std::nullopt;
6204
6205 Type *RHSInputType = LHSInputType;
6206 ExtendKind RHSExtendKind = LHSExtendKind;
6207 if (RHSCast) {
6208 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6209 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6210 }
6211
6212 return ExtendedReductionOperand{
6213 MulOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6214}
6215
6216/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6217/// and determines if the target can use a cheaper operation with a wider
6218/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6219/// of operations in the reduction.
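///
/// For example (illustrative numbers, assuming a target with dot-product style
/// partial reductions): an i32 accumulator fed by extends of i8 inputs can use
/// an input VF that is 4x the accumulator VF, e.g. 16 x i8 inputs per vector
/// iteration reducing into a 4 x i32 PHI, i.e. a scale factor of 32/8 = 4.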
6220static std::optional<SmallVector<VPPartialReductionChain>>
6221getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6222 VFRange &Range) {
6223 // Get the backedge value from the reduction PHI and find the
6224 // ComputeReductionResult that uses it (directly or through a select for
6225 // predicated reductions).
6226 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6227 if (!RdxResult)
6228 return std::nullopt;
6229 VPValue *ExitValue = RdxResult->getOperand(0);
6230 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6231
6232 VPTypeAnalysis &TypeInfo = CostCtx.Types;
6234 RecurKind RK = RedPhiR->getRecurrenceKind();
6235 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6236 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6237
6238 // Work backwards from the ExitValue examining each reduction operation.
6239 VPValue *CurrentValue = ExitValue;
6240 while (CurrentValue != RedPhiR) {
6241 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6242 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6243 return std::nullopt;
6244
6245 VPValue *Op = UpdateR->getOperand(1);
6246 VPValue *PrevValue = UpdateR->getOperand(0);
6247
6248 // Find the extended operand. The other operand (PrevValue) is the next link
6249 // in the reduction chain.
6250 std::optional<ExtendedReductionOperand> ExtendedOp =
6251 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6252 if (!ExtendedOp) {
6253 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6254 if (!ExtendedOp)
6255 return std::nullopt;
6256 std::swap(Op, PrevValue);
6257 }
6258
6259 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6260 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6261 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6262 return std::nullopt;
6263
6264 // Record this link of the chain. Whether the target supports it (i.e. it
6265 // has a valid cost) is checked later, when the VF range is clamped based on
6266 // profitability.
6267 VPPartialReductionChain Link(
6268 {UpdateR, *ExtendedOp, RK,
6269 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6270 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6271 Chain.push_back(Link);
6272 CurrentValue = PrevValue;
6273 }
6274
6275 // The chain links were collected by traversing backwards from the exit value.
6276 // Reverse the chain so the links are in program order.
6277 std::reverse(Chain.begin(), Chain.end());
6278 return Chain;
6279}
6280} // namespace
6281
6283 VPCostContext &CostCtx,
6284 VFRange &Range) {
6285 // Find all possible valid partial reductions, grouping chains by their PHI.
6286 // This grouping allows invalidating the whole chain if any link is not a
6287 // valid partial reduction.
6289 ChainsByPhi;
6290 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6291 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6292 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6293 if (!RedPhiR)
6294 continue;
6295
6296 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6297 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6298 }
6299
6300 if (ChainsByPhi.empty())
6301 return;
6302
6303 // Build a set of partial reduction operations for extend-user validation and
6304 // a map of reduction bin ops to their scale factors for scale validation.
6305 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6306 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6307 for (const auto &[_, Chains] : ChainsByPhi)
6308 for (const VPPartialReductionChain &Chain : Chains) {
6309 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6310 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6311 }
6312
6313 // A partial reduction is invalid if any of its extends are used by
6314 // something that isn't another partial reduction. This is because the
6315 // extends are intended to be lowered along with the reduction itself.
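// For example, if an extend feeding the multiply is also used by a plain
// store, it cannot be folded away into the partial reduction, so the whole
// chain is treated as invalid and cleared below.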
6316 auto ExtendUsersValid = [&](VPValue *Ext) {
6317 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6318 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6319 });
6320 };
6321
6322 auto IsProfitablePartialReductionChainForVF =
6323 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6324 InstructionCost PartialCost = 0, RegularCost = 0;
6325
6326 // The chain is a profitable partial reduction chain if handling the entire
6327 // chain with partial reductions is cheaper than handling it with regular
6328 // reductions.
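// For example (hypothetical costs): if a partial reduction covering
// {extends, mul, add} costs 2, while costing the extends, mul and add
// separately as 1 each gives 4, the chain is considered profitable.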
6329 for (const VPPartialReductionChain &Link : Chain) {
6330 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6331 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6332 if (!LinkCost.isValid())
6333 return false;
6334
6335 PartialCost += LinkCost;
6336 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6337 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6338 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6339 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6340 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6341 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6342 RegularCost += Extend->computeCost(VF, CostCtx);
6343 }
6344 return PartialCost.isValid() && PartialCost < RegularCost;
6345 };
6346
6347 // Validate chains: check that extends are only used by partial reductions,
6348 // and that reduction bin ops are only used by other partial reductions with
6349 // matching scale factors, by users outside the loop region, or by the
6350 // select introduced by tail-folding. Otherwise we would create users of
6351 // scaled reductions where the types of the other operands don't match.
6352 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6353 for (const VPPartialReductionChain &Chain : Chains) {
6354 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6355 Chains.clear();
6356 break;
6357 }
6358 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6359 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6360 return PhiR == RedPhiR;
6361 auto *R = cast<VPSingleDefRecipe>(U);
6362 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6364 m_Specific(Chain.ReductionBinOp))) ||
6365 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6366 m_Specific(RedPhiR)));
6367 };
6368 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6369 Chains.clear();
6370 break;
6371 }
6372
6373 // Check if the compute-reduction-result is used by a sunk store.
6374 // TODO: Also form partial reductions in those cases.
6375 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6376 if (any_of(RdxResult->users(), [](VPUser *U) {
6377 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6378 return RepR && RepR->getOpcode() == Instruction::Store;
6379 })) {
6380 Chains.clear();
6381 break;
6382 }
6383 }
6384 }
6385
6386 // Clear the chain if it is not profitable.
6388 [&, &Chains = Chains](ElementCount VF) {
6389 return IsProfitablePartialReductionChainForVF(Chains, VF);
6390 },
6391 Range))
6392 Chains.clear();
6393 }
6394
6395 for (auto &[Phi, Chains] : ChainsByPhi)
6396 for (const VPPartialReductionChain &Chain : Chains)
6397 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6398}
6399
6401 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6402 // Collect all loads/stores first. We start with the ones that have simpler
6403 // decisions, followed by more complex ones whose decisions may be guided by
6404 // or depend on the simpler ones.
6406 for (VPBasicBlock *VPBB :
6409 for (VPRecipeBase &R : *VPBB) {
6410 auto *VPI = dyn_cast<VPInstruction>(&R);
6411 if (VPI && VPI->getUnderlyingValue() &&
6412 is_contained({Instruction::Load, Instruction::Store},
6413 VPI->getOpcode()))
6414 MemOps.push_back(VPI);
6415 }
6416 }
6417
6418 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6419 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6420
6421 for (VPInstruction *VPI : MemOps) {
6422 auto ReplaceWith = [&](VPRecipeBase *New) {
6423 New->insertBefore(VPI);
6424 if (VPI->getOpcode() == Instruction::Load)
6425 VPI->replaceAllUsesWith(New->getVPSingleValue());
6426 VPI->eraseFromParent();
6427 };
6428
6429 // Note: we must do this for the scalar VPlan as well.
6430 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6431 FinalRedStoresBuilder))
6432 continue;
6433
6434 // Filter out scalar VPlan for the remaining memory operations.
6436 [](ElementCount VF) { return VF.isScalar(); }, Range))
6437 continue;
6438
6439 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6440 ReplaceWith(Histogram);
6441 continue;
6442 }
6443
6444 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6445 if (!Recipe)
6446 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6447
6448 ReplaceWith(Recipe);
6449 }
6450}
6451
6454 [&](ElementCount VF) { return VF.isScalar(); }, Range))
6455 return;
6456
6458 Plan.getEntry());
6460 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
6461 auto *VPI = dyn_cast<VPInstruction>(&R);
6462 if (!VPI)
6463 continue;
6464
6465 auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
6466 // Wouldn't be able to create a `VPReplicateRecipe` anyway.
6467 if (!I)
6468 continue;
6469
6470 // If executing other lanes produces side-effects we can't avoid them.
6471 if (VPI->mayHaveSideEffects())
6472 continue;
6473
6474 // We want to drop the mask operand, so verify we can safely do that.
6475 if (VPI->isMasked() && !VPI->isSafeToSpeculativelyExecute())
6476 continue;
6477
6478 // Avoid rewriting IV increment as that interferes with
6479 // `removeRedundantCanonicalIVs`.
6480 if (VPI->getOpcode() == Instruction::Add &&
6482 continue;
6483
6484 // Other lanes are needed - can't drop them.
6486 continue;
6487
6488 auto *Recipe = new VPReplicateRecipe(
6489 I, VPI->operandsWithoutMask(), /*IsSingleScalar=*/true,
6490 /*Mask=*/nullptr, *VPI, *VPI, VPI->getDebugLoc());
6491 Recipe->insertBefore(VPI);
6492 VPI->replaceAllUsesWith(Recipe);
6493 VPI->eraseFromParent();
6494 }
6495 }
6496}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void expandVPDerivedIV(VPDerivedIVRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPDerivedIVRecipe into executable recipes.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRegionBlock *ParentRegion, VPlan &Plan)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool cannotHoistOrSinkRecipe(VPRecipeBase &R, VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Return true if we do not know how to (mechanically) hoist or sink a non-memory or memory recipe R out...
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
iterator begin() const
Definition ArrayRef.h:129
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
static LLVM_ABI InductionDescriptor getCanonicalIntInduction(Type *Ty, ScalarEvolution &SE)
Returns the canonical integer induction for type Ty with start = 0 and step = 1.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1665
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
bool empty() const
Definition MapVector.h:79
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
@ SK_Broadcast
Broadcast element 0 to all other elements.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3779
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4146
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4221
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4173
iterator end()
Definition VPlan.h:4183
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4181
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4234
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:233
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:549
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:628
const VPRecipeBase & back() const
Definition VPlan.h:4195
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2779
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2815
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2805
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2821
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2801
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:97
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:318
VPRegionBlock * getParent()
Definition VPlan.h:189
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:203
size_t getNumSuccessors() const
Definition VPlan.h:240
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:309
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:225
VPlan * getPlan()
Definition VPlan.cpp:178
const std::string & getName() const
Definition VPlan.h:180
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:328
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:236
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:183
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:282
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:230
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:214
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:322
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:223
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:241
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:259
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:295
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:279
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-sucessor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3270
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createFirstActiveLane(ArrayRef< VPValue * > Masks, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1650
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step)
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
VPWidenPHIRecipe * createWidenPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3811
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:504
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:477
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:489
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:499
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3895
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3315
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2299
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2341
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2330
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2044
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4299
Class to record and manage LLVM IR flags.
Definition VPlan.h:691
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1171
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1323
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1266
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1317
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1261
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1258
@ CanonicalIVIncrementForPart
Definition VPlan.h:1242
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1269
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2916
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2908
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2937
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2989
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2947
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3457
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:405
VPBasicBlock * getParent()
Definition VPlan.h:479
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:557
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3149
A recipe for handling reduction phis.
Definition VPlan.h:2685
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2732
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2725
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2743
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3040
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4356
const VPBlockBase * getEntry() const
Definition VPlan.h:4400
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4432
VPInstruction * getOrCreateCanonicalIVIncrement()
Get the canonical IV increment instruction if it exists.
Definition VPlan.cpp:846
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4417
Type * getCanonicalIVType() const
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4476
bool hasCanonicalIVNUW() const
Indicates if NUW is set for the canonical IV increment, for loop regions.
Definition VPlan.h:4481
void clearCanonicalIVNUW(VPInstruction *Increment)
Unsets NUW for the canonical IV increment Increment, for loop regions.
Definition VPlan.h:4484
VPRegionValue * getCanonicalIV()
Return the canonical induction variable of the region, null for replicating regions.
Definition VPlan.h:4468
const VPBlockBase * getExiting() const
Definition VPlan.h:4412
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4425
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3194
bool isSingleScalar() const
Definition VPlan.h:3235
bool isPredicated() const
Definition VPlan.h:3237
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3254
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3966
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:676
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:335
operand_range operands()
Definition VPlanValue.h:403
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:379
unsigned getNumOperands() const
Definition VPlanValue.h:373
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:374
void addOperand(VPValue *Operand)
Definition VPlanValue.h:368
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:49
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:138
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1475
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:128
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:74
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:202
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1478
unsigned getNumUsers() const
Definition VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1484
user_range users()
Definition VPlanValue.h:155
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2150
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1836
Instruction::CastOps getOpcode() const
Definition VPlan.h:1874
A recipe for handling GEP instructions.
Definition VPlan.h:2086
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2365
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2393
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2396
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2416
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2447
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2494
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2498
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2525
A recipe for widening vector intrinsics.
Definition VPlan.h:1888
A common base class for widening memory operations.
Definition VPlan.h:3493
A recipe for widened phis.
Definition VPlan.h:2583
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1780
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1800
unsigned getOpcode() const
Definition VPlan.h:1817
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4504
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4829
bool hasVF(ElementCount VF) const
Definition VPlan.h:4727
const DataLayout & getDataLayout() const
Definition VPlan.h:4709
LLVMContext & getContext() const
Definition VPlan.h:4705
VPBasicBlock * getEntry()
Definition VPlan.h:4600
bool hasScalableVF() const
Definition VPlan.h:4728
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4663
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4684
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4734
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4800
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4703
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4806
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4878
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4832
bool hasUF(unsigned UF) const
Definition VPlan.h:4752
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4653
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4693
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4690
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4777
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4803
void setVF(ElementCount VF)
Definition VPlan.h:4715
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4768
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1065
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4755
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4677
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4629
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4855
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4797
VPBasicBlock * getVectorPreheader() const
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4605
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4700
bool hasScalarVFOnly() const
Definition VPlan.h:4745
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4643
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4696
void setUF(unsigned UF)
Definition VPlan.h:4760
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:4910
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1221
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4811
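The VPlan accessors listed above are what most transforms in this file query before rewriting recipes. A minimal sketch (hypothetical helper name, not part of VPlanTransforms.cpp), using only accessors documented here:

static void exampleQueryPlan(VPlan &Plan) {
  // Bail out if the loop region has already been dissolved into plain CFG.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  if (!LoopRegion)
    return;
  // The vector-loop preheader and the original loop's trip count are common
  // anchor points for inserting new recipes.
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPValue *TripCount = Plan.getTripCount();
  (void)VectorPH;
  (void)TripCount;
  // Some transforms only apply when a scalable VF is still a candidate.
  if (!Plan.hasScalableVF())
    return;
}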
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2815
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
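The entries above are the IR-level llvm::PatternMatch combinators this file pulls in. A small illustrative sketch (hypothetical helper; it also uses the standard m_Value binder from PatternMatch, which is not listed above):

static bool isAddOfOne(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X;
  // Matches (X + 1) with the operands in either order.
  return match(V, m_c_Add(m_Value(X), m_One()));
}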
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
match_bind< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
match_bind< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
match_bind< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
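The VPlanPatternMatch matchers above compose the same way as their IR counterparts, but operate on VPValues and recipes. A minimal sketch using only matchers listed here (hypothetical helper name):

static bool isOrWithFalseOperand(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  // Matches (or X, false) for any VPValue X, with the operands in either
  // order; such a recipe could be replaced by X.
  return match(&R, m_c_BinaryOr(m_VPValue(), m_False()));
}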
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool cannotHoistOrSinkRecipe(const VPRecipeBase &R, bool Sinking=false)
Return true if we do not know how to (mechanically) hoist or sink R.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
VPInstruction * findCanonicalIVIncrement(VPlan &Plan)
Find the canonical IV increment of Plan's vector loop region.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:116
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:137
bool isUniformAcrossVFsAndUFs(const VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
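The helpers above live in the vputils namespace and serve as building blocks for many transforms. A small sketch combining two of them (hypothetical helper name):

static bool needsOnlyScalarValues(const VPValue *Def) {
  // No vector value has to be generated for Def if every user either reads
  // just the first lane or only consumes scalar values.
  return vputils::onlyFirstLaneUsed(Def) || vputils::onlyScalarValuesUsed(Def);
}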
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:557
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2179
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2077
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
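make_early_inc_range is the idiom used throughout this file to erase recipes while iterating over a block. An illustrative sketch (hypothetical helper; real dead-code removal would also check for side effects):

static void eraseTriviallyDeadRecipes(VPBasicBlock &VPBB) {
  for (VPRecipeBase &R : make_early_inc_range(VPBB)) {
    // The iterator has already been advanced, so erasing R is safe.
    if (R.getNumDefinedValues() == 1 &&
        R.getVPSingleValue()->getNumUsers() == 0)
      R.eraseFromParent();
  }
}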
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:265
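The two depth-first helpers above are the standard way to walk a plan's blocks, either skipping region contents (shallow) or descending into them (deep). A minimal traversal sketch (hypothetical helper name):

static void visitAllBasicBlocks(VPlan &Plan) {
  for (VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry())) {
    auto *VPBB = dyn_cast<VPBasicBlock>(Block);
    if (!VPBB)
      continue; // Region blocks themselves carry no recipes.
    for (VPRecipeBase &R : *VPBB)
      (void)R; // Visit each recipe.
  }
}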
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1151
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:82
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:87
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1849
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
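The reduction kinds above pair with getRecurrenceIdentity to obtain the neutral start value for a reduction. A hedged sketch (hypothetical helper; for an integer add the identity is simply zero of the element type):

static llvm::Value *addReductionIdentity(llvm::Type *Ty) {
  // FastMathFlags are irrelevant for integer recurrences and passed empty.
  return llvm::getRecurrenceIdentity(llvm::RecurKind::Add, Ty,
                                     llvm::FastMathFlags());
}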
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2087
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1408
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:325
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2667
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:240
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:142
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
InstructionCost spillCost(const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, unsigned OverrideMaxNumRegs=0) const
Calculate the estimated cost of any spills due to using more registers than the number available for ...
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:280
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:291
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3612
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3572
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3696
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3653
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void replaceWideCanonicalIVWithWideIV(VPlan &Plan, ScalarEvolution &SE, const TargetTransformInfo &TTI, TargetTransformInfo::TargetCostKind CostKind, ElementCount VF, unsigned UF, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Replace a VPWidenCanonicalIVRecipe if it is present in Plan, with a VPWidenIntOrFpInductionRecipe,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step, std::optional< uint64_t > MaxRuntimeStep=std::nullopt)
Materialize vector trip count computations to a set of VPInstructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range)
Make VPlan-based scalarization decision prior to delegating to the ones made by the legacy CM.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses of the canonical ...
static void adjustFirstOrderRecurrenceMiddleUsers(VPlan &Plan, VFRange &Range)
Adjust first-order recurrence users in the middle block: create penultimate element extracts for LCSS...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static void dropPoisonGeneratingRecipes(VPlan &Plan)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...