LLVM 23.0.0git
VPlanTransforms.cpp
Go to the documentation of this file.
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
// NOTE(review): the scraped listing dropped the line above this one (source
// line 50), which carried the function signature. From the parameters and
// body this is presumably
// VPlanTransforms::tryToConvertVPInstructionsToVPRecipes -- confirm against
// the upstream file. The function replaces plain VPInstructions that wrap IR
// instructions with widened recipes, returning false if widening is not
// possible for the plan.
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
// NOTE(review): scrape gap -- source line 53 missing; presumably the loop
// header over the blocks of the vector loop region that binds VPBB below.
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
// Only recipes that wrap an underlying IR value are converted.
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
// NOTE(review): scrape gap -- source line 69 missing; presumably defines
// Inst (the underlying IR Instruction) used by the dyn_casts below.
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, false /*Reverse*/, *VPI,
84 Ingredient.getDebugLoc());
85 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
86 NewRecipe = new VPWidenStoreRecipe(
87 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
88 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
89 Ingredient.getDebugLoc());
// NOTE(review): scrape gap -- source line 90 missing; presumably the
// "else if (GetElementPtrInst *GEP = dyn_cast<...>(Inst))" branch head
// that binds GEP used on the next line.
91 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
92 Ingredient.getDebugLoc());
93 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
94 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
95 if (VectorID == Intrinsic::not_intrinsic)
96 return false;
97
98 // The noalias.scope.decl intrinsic declares a noalias scope that
99 // is valid for a single iteration. Emitting it as a single-scalar
100 // replicate would incorrectly extend the scope across multiple
101 // original iterations packed into one vector iteration.
102 // FIXME: If we want to vectorize this loop, then we have to drop
103 // all the associated !alias.scope and !noalias.
104 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
105 return false;
106
107 // These intrinsics are recognized by getVectorIntrinsicIDForCall
108 // but are not widenable. Emit them as replicate instead of widening.
109 if (VectorID == Intrinsic::assume ||
110 VectorID == Intrinsic::lifetime_end ||
111 VectorID == Intrinsic::lifetime_start ||
112 VectorID == Intrinsic::sideeffect ||
113 VectorID == Intrinsic::pseudoprobe) {
114 // If the operand of llvm.assume holds before vectorization, it will
115 // also hold per lane.
116 // llvm.pseudoprobe requires to be duplicated per lane for accurate
117 // sample count.
118 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
119 VectorID != Intrinsic::pseudoprobe;
120 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
121 /*IsSingleScalar=*/IsSingleScalar,
122 /*Mask=*/nullptr, *VPI, *VPI,
123 Ingredient.getDebugLoc());
124 } else {
// drop_end: the last operand of a call VPInstruction is the callee,
// which VPWidenIntrinsicRecipe does not take.
125 NewRecipe = new VPWidenIntrinsicRecipe(
126 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
127 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
128 }
129 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
130 NewRecipe = new VPWidenCastRecipe(
131 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
132 VPIRFlags(*CI), VPIRMetadata(*CI));
133 } else {
134 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
135 *VPI, Ingredient.getDebugLoc());
136 }
137 } else {
// NOTE(review): scrape gap -- source line 138 missing; presumably an
// assert head whose message string follows on line 139.
139 "inductions must be created earlier");
140 continue;
141 }
142
// Splice the widened recipe in place of the original ingredient and
// redirect all uses before erasing it.
143 NewRecipe->insertBefore(&Ingredient);
144 if (NewRecipe->getNumDefinedValues() == 1)
145 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
146 else
147 assert(NewRecipe->getNumDefinedValues() == 0 &&
148 "Only recpies with zero or one defined values expected");
149 Ingredient.eraseFromParent();
150 }
151 }
152 return true;
153}
154
155/// Helper for extra no-alias checks via known-safe recipe and SCEV.
// NOTE(review): scrape gap -- source line 156 missing; presumably the
// "struct SinkStoreInfo {" (or class) header for the members below.
157 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
158 VPReplicateRecipe &GroupLeader;
// NOTE(review): scrape gap -- source line 159 missing; presumably a
// PredicatedScalarEvolution &PSE member, referenced below.
160 const Loop &L;
161 VPTypeAnalysis &TypeInfo;
162
163 // Return true if \p A and \p B are known to not alias for all VFs in the
164 // plan, checked via the distance between the accesses
165 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
// Only store/store pairs are handled; operand 1 of a replicated store is
// its address (see use below).
166 if (A->getOpcode() != Instruction::Store ||
167 B->getOpcode() != Instruction::Store)
168 return false;
169
170 VPValue *AddrA = A->getOperand(1);
171 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
172 VPValue *AddrB = B->getOperand(1);
173 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
// NOTE(review): scrape gap -- source line 174 missing; presumably a check
// that either SCEV is SCEVCouldNotCompute, guarding the bail-out below.
175 return false;
176
// Bail out unless the address difference is a compile-time constant.
177 const APInt *Distance;
178 ScalarEvolution &SE = *PSE.getSE();
179 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
180 return false;
181
182 const DataLayout &DL = SE.getDataLayout();
183 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
184 uint64_t SizeA = DL.getTypeStoreSize(TyA);
185 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
186 uint64_t SizeB = DL.getTypeStoreSize(TyB);
187
188 // Use the maximum store size to ensure no overlap from either direction.
189 // Currently only handles fixed sizes, as it is only used for
190 // replicating VPReplicateRecipes.
191 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
192
193 auto VFs = B->getParent()->getPlan()->vectorFactors();
// NOTE(review): scrape gap -- source line 194 missing; presumably computes
// MaxVF from VFs (e.g. via max_element), used below.
195 if (MaxVF.isScalable())
196 return false;
// No-alias holds if the constant distance covers the widest vectorized
// access footprint.
197 return Distance->abs().uge(
198 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
199 }
200
201public:
// NOTE(review): scrape gap -- source lines 202-203 missing; presumably the
// constructor name and its first parameters (ExcludeRecipes, GroupLeader,
// PSE), matching the member-init list below.
204 const Loop &L, VPTypeAnalysis &TypeInfo)
205 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
206 L(L), TypeInfo(TypeInfo) {}
207
208 /// Return true if \p R should be skipped during alias checking, either
209 /// because it's in the exclude set or because no-alias can be proven via
210 /// SCEV.
211 bool shouldSkip(VPRecipeBase &R) const {
212 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
213 return ExcludeRecipes.contains(&R) ||
214 (Store && isNoAliasViaDistance(Store, &GroupLeader));
215 }
216};
217
218/// Check if a memory operation doesn't alias with memory operations using
219/// scoped noalias metadata, in blocks in the single-successor chain between \p
220/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
221/// write to memory are checked (for load hoisting). Otherwise recipes that both
222/// read and write memory are checked, and SCEV is used to prove no-alias
223/// between the group leader and other replicate recipes (for store sinking).
224static bool
// NOTE(review): scrape gap -- source line 225 missing; presumably the
// function name and its first parameter (the MemoryLocation MemLoc that the
// body reads below) -- confirm against the upstream file.
226 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
227 std::optional<SinkStoreInfo> SinkInfo = {}) {
228 bool CheckReads = SinkInfo.has_value();
// Without scoped-noalias metadata on the location there is nothing to
// reason with; conservatively report possible aliasing.
229 if (!MemLoc.AATags.Scope)
230 return false;
231
232 for (VPBasicBlock *VPBB :
// NOTE(review): scrape gap -- source line 233 missing; presumably the range
// expression walking the single-successor chain FirstBB..LastBB.
234 for (VPRecipeBase &R : *VPBB) {
235 if (SinkInfo && SinkInfo->shouldSkip(R))
236 continue;
237
238 // Skip recipes that don't need checking.
239 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
240 continue;
241
// NOTE(review): scrape gap -- source line 242 missing; presumably obtains
// the optional MemoryLocation Loc for recipe R.
243 if (!Loc)
244 // Conservatively assume aliasing for memory operations without
245 // location.
246 return false;
247
// NOTE(review): scrape gap -- source line 248 missing; presumably the
// scoped-AA query between MemLoc and Loc guarding this bail-out.
249 return false;
250 }
251 }
252 return true;
253}
254
255/// Collect either replicated Loads or Stores grouped by their address SCEV, in
256/// a deep-traversal of the vector loop region in \p Plan.
257template <unsigned Opcode>
// NOTE(review): scrape gap -- source lines 258-259 missing; presumably the
// return type (a vector of recipe groups) and the function name plus its
// first parameter -- confirm against the upstream file.
260 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
261 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
262 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
263 "Only Load and Store opcodes supported");
264 constexpr bool IsLoad = (Opcode == Instruction::Load);
// NOTE(review): scrape gap -- source line 265 missing; presumably the map
// type declaration whose variable name follows on the next line.
266 RecipesByAddress;
// NOTE(review): scrape gap -- source lines 267-268 missing; presumably the
// deep-traversal loop over the vector loop region's blocks binding VPBB.
269 for (VPRecipeBase &R : *VPBB) {
270 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
271 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
272 continue;
273
274 // For loads, operand 0 is address; for stores, operand 1 is address.
275 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
276 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
277 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
278 RecipesByAddress[AddrSCEV].push_back(RepR);
279 }
280 }
281 auto Groups = to_vector(RecipesByAddress.values());
282 VPDominatorTree VPDT(Plan);
283 for (auto &Group : Groups) {
284 // Sort mem ops by dominance order, with earliest (most dominating) first.
// NOTE(review): scrape gap -- source line 285 missing; presumably the
// sort call with a lambda whose body follows on the next line.
286 return VPDT.properlyDominates(A, B);
287 });
288 }
289 return Groups;
290}
291
292/// Return true if we do not know how to (mechanically) hoist or sink \p R out
293/// of a loop region.
// NOTE(review): scrape gap -- source line 294 missing; presumably the
// signature "static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {"
// (the name is used by sinkScalarOperands later in this file) -- confirm.
295 // Assumes don't alias anything or throw; as long as they're guaranteed to
296 // execute, they're safe to hoist.
// NOTE(review): scrape gap -- source line 297 missing; presumably the
// condition recognizing an assume, guarding the early "return false".
298 return false;
299
300 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
301 // memory location is not modified in the vector loop.
302 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
303 return true;
304
305 // Allocas cannot be hoisted.
306 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
307 return RepR && RepR->getOpcode() == Instruction::Alloca;
308}
309
// Sink scalar recipes into replicate blocks where their results are used,
// duplicating them when users outside the target block only need the first
// lane. Returns true if any recipe was moved or cloned.
310static bool sinkScalarOperands(VPlan &Plan) {
311 auto Iter = vp_depth_first_deep(Plan.getEntry());
312 bool ScalarVFOnly = Plan.hasScalarVFOnly();
313 bool Changed = false;
314
// NOTE(review): scrape gap -- source line 315 missing; presumably the
// WorkList (SetVector of {SinkTo, candidate} pairs) declaration used below.
316 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
317 VPBasicBlock *SinkTo, VPValue *Op) {
318 auto *Candidate =
319 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
320 if (!Candidate)
321 return;
322
323 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
324 // for now.
// NOTE(review): scrape gap -- source line 325 missing; presumably the isa<>
// check implementing the restriction described in the comment above.
326 return;
327
328 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
329 return;
330
331 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
332 if (!ScalarVFOnly && RepR->isSingleScalar())
333 return;
334
335 WorkList.insert({SinkTo, Candidate});
336 };
337
338 // First, collect the operands of all recipes in replicate blocks as seeds for
339 // sinking.
// NOTE(review): scrape gap -- source line 340 missing; presumably the loop
// over the plan's region blocks binding VPR below.
341 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
342 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
343 continue;
344 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
345 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
346 continue;
347 for (auto &Recipe : *VPBB)
348 for (VPValue *Op : Recipe.operands())
349 InsertIfValidSinkCandidate(VPBB, Op);
350 }
351
352 // Try to sink each replicate or scalar IV steps recipe in the worklist.
353 for (unsigned I = 0; I != WorkList.size(); ++I) {
354 VPBasicBlock *SinkTo;
355 VPSingleDefRecipe *SinkCandidate;
356 std::tie(SinkTo, SinkCandidate) = WorkList[I];
357
358 // All recipe users of SinkCandidate must be in the same block SinkTo or all
359 // users outside of SinkTo must only use the first lane of SinkCandidate. In
360 // the latter case, we need to duplicate SinkCandidate.
361 auto UsersOutsideSinkTo =
362 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
363 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
364 });
365 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
366 return !U->usesFirstLaneOnly(SinkCandidate);
367 }))
368 continue;
369 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
370
371 if (NeedsDuplicating) {
372 if (ScalarVFOnly)
373 continue;
374 VPSingleDefRecipe *Clone;
375 if (auto *SinkCandidateRepR =
376 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
377 // TODO: Handle converting to uniform recipes as separate transform,
378 // then cloning should be sufficient here.
379 Instruction *I = SinkCandidate->getUnderlyingInstr();
// The clone stays outside SinkTo as a single-scalar copy serving the
// first-lane-only users.
380 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
381 nullptr /*Mask*/, *SinkCandidateRepR,
382 *SinkCandidateRepR);
383 // TODO: add ".cloned" suffix to name of Clone's VPValue.
384 } else {
385 Clone = SinkCandidate->clone();
386 }
387
388 Clone->insertBefore(SinkCandidate);
389 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
390 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
391 });
392 }
393 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
// Sinking may expose the candidate's own operands as new candidates.
394 for (VPValue *Op : SinkCandidate->operands())
395 InsertIfValidSinkCandidate(SinkTo, Op);
396 Changed = true;
397 }
398 return Changed;
399}
400
401/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
402/// the mask.
// NOTE(review): scrape gap -- source line 403 missing; presumably the
// signature "static VPValue *getPredicatedMask(VPRegionBlock *R) {" (the
// name is used by mergeReplicateRegionsIntoSuccessors below) -- confirm.
404 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
// The entry must consist of exactly the branch-on-mask recipe.
405 if (!EntryBB || EntryBB->size() != 1 ||
406 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
407 return nullptr;
408
409 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
410}
411
412/// If \p R is a triangle region, return the 'then' block of the triangle.
// NOTE(review): scrape gap -- source line 413 missing; presumably the
// signature "static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {"
// (the name is used by mergeReplicateRegionsIntoSuccessors below) -- confirm.
414 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
415 if (EntryBB->getNumSuccessors() != 2)
416 return nullptr;
417
418 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
419 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
420 if (!Succ0 || !Succ1)
421 return nullptr;
422
// Triangle shape: exactly one successor branches through the other, i.e.
// one of the two blocks is the 'then' block falling through to the other.
423 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
424 return nullptr;
425 if (Succ0->getSingleSuccessor() == Succ1)
426 return Succ0;
427 if (Succ1->getSingleSuccessor() == Succ0)
428 return Succ1;
429 return nullptr;
430}
431
432// Merge replicate regions in their successor region, if a replicate region
433// is connected to a successor replicate region with the same predicate by a
434// single, empty VPBasicBlock.
// NOTE(review): scrape gap -- source line 435 missing; presumably the
// signature "static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {"
// (the name is called from createAndOptimizeReplicateRegions) -- confirm.
436 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
437
438 // Collect replicate regions followed by an empty block, followed by another
439 // replicate region with matching masks to process front. This is to avoid
440 // iterator invalidation issues while merging regions.
// NOTE(review): scrape gap -- source lines 441-442 missing; presumably the
// WorkList declaration and the loop over region blocks binding Region1.
443 vp_depth_first_deep(Plan.getEntry()))) {
444 if (!Region1->isReplicator())
445 continue;
446 auto *MiddleBasicBlock =
447 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
448 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
449 continue;
450
451 auto *Region2 =
452 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
453 if (!Region2 || !Region2->isReplicator())
454 continue;
455
456 VPValue *Mask1 = getPredicatedMask(Region1);
457 VPValue *Mask2 = getPredicatedMask(Region2);
458 if (!Mask1 || Mask1 != Mask2)
459 continue;
460
461 assert(Mask1 && Mask2 && "both region must have conditions");
462 WorkList.push_back(Region1);
463 }
464
465 // Move recipes from Region1 to its successor region, if both are triangles.
466 for (VPRegionBlock *Region1 : WorkList) {
467 if (TransformedRegions.contains(Region1))
468 continue;
469 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
470 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
471
472 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
473 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
474 if (!Then1 || !Then2)
475 continue;
476
477 // Note: No fusion-preventing memory dependencies are expected in either
478 // region. Such dependencies should be rejected during earlier dependence
479 // checks, which guarantee accesses can be re-ordered for vectorization.
480 //
481 // Move recipes to the successor region.
482 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
483 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
484
485 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
486 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
487
488 // Move VPPredInstPHIRecipes from the merge block to the successor region's
489 // merge block. Update all users inside the successor region to use the
490 // original values.
491 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
492 VPValue *PredInst1 =
493 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
494 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
495 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
496 return cast<VPRecipeBase>(&U)->getParent() == Then2;
497 });
498
499 // Remove phi recipes that are unused after merging the regions.
500 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
501 Phi1ToMove.eraseFromParent();
502 continue;
503 }
504 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
505 }
506
507 // Remove the dead recipes in Region1's entry block.
508 for (VPRecipeBase &R :
509 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
510 R.eraseFromParent();
511
512 // Finally, remove the first region.
513 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
514 VPBlockUtils::disconnectBlocks(Pred, Region1);
515 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
516 }
517 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
518 TransformedRegions.insert(Region1);
519 }
520
521 return !TransformedRegions.empty();
522}
523
// NOTE(review): scrape gap -- source line 524 missing; presumably the
// signature head, e.g. "static VPRegionBlock *createReplicateRegion(
// VPReplicateRecipe *PredRecipe," given the uses of PredRecipe below and
// the "return Region" of type VPRegionBlock -- confirm upstream.
// Builds a triangular if-then replicate region around a predicated
// replicate recipe: entry (branch-on-mask) -> if (unmasked replicate) ->
// continue (optional VPPredInstPHIRecipe merging the result).
525 VPlan &Plan) {
526 Instruction *Instr = PredRecipe->getUnderlyingInstr();
527 // Build the triangular if-then region.
528 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
529 assert(Instr->getParent() && "Predicated instruction not in any basic block");
530 auto *BlockInMask = PredRecipe->getMask();
531 auto *MaskDef = BlockInMask->getDefiningRecipe();
532 auto *BOMRecipe = new VPBranchOnMaskRecipe(
533 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
534 auto *Entry =
535 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
536
537 // Replace predicated replicate recipe with a replicate recipe without a
538 // mask but in the replicate region.
// drop_end: the mask is the last operand of the predicated recipe and is
// omitted from the unmasked clone.
539 auto *RecipeWithoutMask = new VPReplicateRecipe(
540 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
541 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
542 PredRecipe->getDebugLoc());
543 auto *Pred =
544 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
545
// A merge phi is only needed when the predicated value has users.
546 VPPredInstPHIRecipe *PHIRecipe = nullptr;
547 if (PredRecipe->getNumUsers() != 0) {
548 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
549 RecipeWithoutMask->getDebugLoc());
550 PredRecipe->replaceAllUsesWith(PHIRecipe);
551 PHIRecipe->setOperand(0, RecipeWithoutMask);
552 }
553 PredRecipe->eraseFromParent();
554 auto *Exiting =
555 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
// NOTE(review): scrape gap -- source line 556 missing; presumably
// "VPRegionBlock *Region =" binding the result of the call below.
557 Plan.createReplicateRegion(Entry, Exiting, RegionName);
558
559 // Note: first set Entry as region entry and then connect successors starting
560 // from it in order, to propagate the "parent" of each VPBasicBlock.
561 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
562 VPBlockUtils::connectBlocks(Pred, Exiting);
563
564 return Region;
565}
566
// Wrap each predicated VPReplicateRecipe in its own if-then replicate
// region, splitting its containing block around the recipe.
567static void addReplicateRegions(VPlan &Plan) {
// NOTE(review): scrape gap -- source lines 568-569 missing; presumably the
// WorkList declaration and the loop header over the plan's basic blocks
// binding VPBB below.
570 vp_depth_first_deep(Plan.getEntry()))) {
571 for (VPRecipeBase &R : *VPBB)
572 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
573 if (RepR->isPredicated())
574 WorkList.push_back(RepR);
575 }
576 }
577
578 unsigned BBNum = 0;
579 for (VPReplicateRecipe *RepR : WorkList) {
580 VPBasicBlock *CurrentBlock = RepR->getParent();
581 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
582
583 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
584 SplitBlock->setName(
585 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
586 // Record predicated instructions for above packing optimizations.
// NOTE(review): scrape gap -- source line 587 missing; presumably binds
// Region (the replicate region created for RepR) used below.
588 Region->setParent(CurrentBlock->getParent());
// NOTE(review): scrape gap -- source line 589 missing; presumably inserts
// Region between CurrentBlock and SplitBlock.
590
// If the split block was the region's exiting block, the region must now
// exit through the new SplitBlock instead.
591 VPRegionBlock *ParentRegion = Region->getParent();
592 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
593 ParentRegion->setExiting(SplitBlock);
594 }
595}
596
// NOTE(review): scrape gap -- the signature and loop head above this line
// (source lines 597-599) were dropped by the scrape; presumably
// "static bool mergeBlocksIntoPredecessors(VPlan &Plan) {", a WorkList
// declaration, and a loop over the plan's basic blocks binding VPBB (the
// name is called from createAndOptimizeReplicateRegions) -- confirm.
// Folds each block with a single predecessor into that predecessor,
// returning true if any block was merged.
600 vp_depth_first_deep(Plan.getEntry()))) {
601 // Don't fold the blocks in the skeleton of the Plan into their single
602 // predecessors for now.
603 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
604 if (!VPBB->getParent())
605 continue;
606 auto *PredVPBB =
607 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
608 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
609 isa<VPIRBasicBlock>(PredVPBB))
610 continue;
611 WorkList.push_back(VPBB);
612 }
613
614 for (VPBasicBlock *VPBB : WorkList) {
615 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
616 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
617 R.moveBefore(*PredVPBB, PredVPBB->end());
618 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
// Keep the enclosing region's exiting-block pointer valid after the merge.
619 auto *ParentRegion = VPBB->getParent();
620 if (ParentRegion && ParentRegion->getExiting() == VPBB)
621 ParentRegion->setExiting(PredVPBB);
622 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
623 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
624 }
625 return !WorkList.empty();
626}
627
// NOTE(review): scrape gap -- source line 628 missing; presumably the
// signature, e.g. "void VPlanTransforms::createAndOptimizeReplicateRegions(
// VPlan &Plan) {" -- confirm upstream.
629 // Convert masked VPReplicateRecipes to if-then region blocks.
// NOTE(review): scrape gap -- source line 630 missing; presumably the call
// "addReplicateRegions(Plan);" implementing the comment above.
631
// Iterate sinking + region merging + block merging to a fixed point, since
// each transform can expose opportunities for the others.
632 bool ShouldSimplify = true;
633 while (ShouldSimplify) {
634 ShouldSimplify = sinkScalarOperands(Plan);
635 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
636 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
637 }
638}
639
640/// Remove redundant casts of inductions.
641///
642/// Such redundant casts are casts of induction variables that can be ignored,
643/// because we already proved that the casted phi is equal to the uncasted phi
644/// in the vectorized loop. There is no need to vectorize the cast - the same
645/// value can be used for both the phi and casts in the vector loop.
// NOTE(review): scrape gap -- source line 646 missing; presumably the
// signature, e.g. "static void removeRedundantInductionCasts(VPlan &Plan) {"
// -- confirm upstream.
647 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
// NOTE(review): scrape gap -- source line 648 missing; presumably binds IV
// (a widened int/fp induction recipe) via dyn_cast of &Phi.
649 if (!IV || IV->getTruncInst())
650 continue;
651
652 // A sequence of IR Casts has potentially been recorded for IV, which
653 // *must be bypassed* when the IV is vectorized, because the vectorized IV
654 // will produce the desired casted value. This sequence forms a def-use
655 // chain and is provided in reverse order, ending with the cast that uses
656 // the IV phi. Search for the recipe of the last cast in the chain and
657 // replace it with the original IV. Note that only the final cast is
658 // expected to have users outside the cast-chain and the dead casts left
659 // over will be cleaned up later.
660 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
661 VPValue *FindMyCast = IV;
662 for (Instruction *IRCast : reverse(Casts)) {
663 VPSingleDefRecipe *FoundUserCast = nullptr;
664 for (auto *U : FindMyCast->users()) {
665 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
666 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
667 FoundUserCast = UserCast;
668 break;
669 }
670 }
671 FindMyCast = FoundUserCast;
672 }
// FindMyCast now refers to the recipe of the final cast in the chain; all
// of its users can use the induction directly.
673 FindMyCast->replaceAllUsesWith(IV);
674 }
675}
676
677/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
678/// recipe, if it exists.
// NOTE(review): scrape gap -- source line 679 missing; presumably the
// signature, e.g. "static void removeRedundantCanonicalIVs(VPlan &Plan) {"
// -- confirm upstream.
680 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
681 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
682 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
683 for (VPUser *U : CanonicalIV->users()) {
// NOTE(review): scrape gap -- source line 684 missing; presumably assigns
// WidenNewIV via dyn_cast<VPWidenCanonicalIVRecipe>(U).
685 if (WidenNewIV)
686 break;
687 }
688
689 if (!WidenNewIV)
690 return;
691
// Look for a widened int/fp induction phi that is canonical and can stand
// in for the widened canonical IV.
692 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
693 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
694 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
695
696 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
697 continue;
698
699 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
700 // everything WidenNewIV's users need. That is, WidenOriginalIV will
701 // generate a vector phi or all users of WidenNewIV demand the first lane
702 // only.
703 if (Plan.hasScalarVFOnly() ||
704 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
705 vputils::onlyFirstLaneUsed(WidenNewIV)) {
706 // We are replacing a wide canonical iv with a suitable wide induction.
707 // This is used to compute header mask, hence all lanes will be used and
708 // we need to drop wrap flags only applying to lanes guranteed to execute
709 // in the original scalar loop.
710 WidenOriginalIV->dropPoisonGeneratingFlags();
711 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
712 WidenNewIV->eraseFromParent();
713 return;
714 }
715 }
716}
717
718/// Returns true if \p R is dead and can be removed.
719static bool isDeadRecipe(VPRecipeBase &R) {
720 // Do remove conditional assume instructions as their conditions may be
721 // flattened.
722 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
723 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
// NOTE(review): scrape gap -- source line 724 missing; presumably the
// remaining conjunct recognizing an llvm.assume call, completing the
// expression started on the previous line.
725 if (IsConditionalAssume)
726 return true;
727
728 if (R.mayHaveSideEffects())
729 return false;
730
731 // Recipe is dead if no user keeps the recipe alive.
732 return all_of(R.definedValues(),
733 [](VPValue *V) { return V->getNumUsers() == 0; });
734}
735
// NOTE(review): scrape gap -- the signature and loop head above this line
// (source lines 736-737) were dropped by the scrape; presumably
// "void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {" followed by the
// loop over the plan's basic blocks binding VPBB -- confirm upstream.
738 vp_post_order_deep(Plan.getEntry()))) {
739 // The recipes in the block are processed in reverse order, to catch chains
740 // of dead recipes.
741 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
742 if (isDeadRecipe(R)) {
743 R.eraseFromParent();
744 continue;
745 }
746
747 // Check if R is a dead VPPhi <-> update cycle and remove it.
748 VPValue *Start, *Incoming;
749 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
750 continue;
751 auto *PhiR = cast<VPPhi>(&R);
// The cycle is dead only when the phi's single user is the recipe defining
// its incoming value, and that recipe has no other users.
752 VPUser *PhiUser = PhiR->getSingleUser();
753 if (!PhiUser)
754 continue;
755 if (PhiUser != Incoming->getDefiningRecipe() ||
756 Incoming->getNumUsers() != 1)
757 continue;
758 PhiR->replaceAllUsesWith(Start);
759 PhiR->eraseFromParent();
760 Incoming->getDefiningRecipe()->eraseFromParent();
761 }
762 }
763}
764
// NOTE(review): scrape gap -- the first signature lines (source lines
// 765-766) were dropped by the scrape; presumably "static
// VPScalarIVStepsRecipe *createScalarIVSteps(VPlan &Plan, <induction kind>,"
// given the parameters below and the createScalarIVSteps call at the end --
// confirm upstream. Builds scalar IV steps derived from the canonical IV,
// truncating base IV and step to a common type when needed.
767 Instruction::BinaryOps InductionOpcode,
768 FPMathOperator *FPBinOp, Instruction *TruncI,
769 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
770 VPBuilder &Builder) {
771 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
772 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
773 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
774 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
775 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
776
777 // Truncate base induction if needed.
778 VPTypeAnalysis TypeInfo(Plan);
779 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
780 if (TruncI) {
781 Type *TruncTy = TruncI->getType();
782 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
783 "Not truncating.");
784 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
785 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
786 ResultTy = TruncTy;
787 }
788
789 // Truncate step if needed.
790 Type *StepTy = TypeInfo.inferScalarType(Step);
791 if (ResultTy != StepTy) {
792 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
793 "Not truncating.");
794 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
795 auto *VecPreheader =
// NOTE(review): scrape gap -- source line 796 missing; presumably the
// expression locating the vector preheader block, completing the
// initialization started on the previous line.
// The step truncation is loop-invariant, so it is emitted in the
// preheader rather than in the loop body.
797 VPBuilder::InsertPointGuard Guard(Builder);
798 Builder.setInsertPoint(VecPreheader);
799 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
800 }
801 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
802 &Plan.getVF(), DL);
803}
804
// NOTE(review): scrape gap -- the signature and worklist seeding above this
// line (source lines 805-806) were dropped by the scrape; presumably
// "static SmallVector<VPUser *> collectUsersRecursively(VPValue *V)" seeding
// the Users set-vector (the name is called from legalizeAndOptimizeInductions
// below) -- confirm upstream. Worklist-collects the transitive users of a
// value, not descending through header phis.
807 for (unsigned I = 0; I != Users.size(); ++I) {
// NOTE(review): scrape gap -- source line 808 missing; presumably binds Cur
// (the current defining recipe) from Users[I].
809 if (isa<VPHeaderPHIRecipe>(Cur))
810 continue;
811 for (VPValue *V : Cur->definedValues())
812 Users.insert_range(V->users());
813 }
814 return Users.takeVector();
815}
816
817/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
818/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
819/// generates scalar values.
820static VPValue *
// NOTE(review): scrape gap -- source line 821 missing; presumably the
// function name and first parameter, e.g.
// "scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,"
// (the name is called from legalizeAndOptimizeInductions below) -- confirm.
822 VPlan &Plan, VPBuilder &Builder) {
// NOTE(review): scrape gap -- source line 823 missing; presumably binds ID
// (the recipe's induction descriptor) used on the next line.
824 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
825 VPValue *StepV = PtrIV->getOperand(1);
// NOTE(review): scrape gap -- source line 826 missing; presumably binds
// Steps to a createScalarIVSteps(...) call whose arguments follow.
827 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
828 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
// Compute each lane's pointer as start + scalar step offset.
830 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
831 PtrIV->getDebugLoc(), "next.gep");
832}
833
834/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
835/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
836/// VPWidenPointerInductionRecipe will generate vectors only. If some users
837/// require vectors while other require scalars, the scalar uses need to extract
838/// the scalars from the generated vectors (Note that this is different to how
839/// int/fp inductions are handled). Legalize extract-from-ends using uniform
840/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
841/// the correct end value is available. Also optimize
842/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
843/// providing them scalar steps built on the canonical scalar IV and update the
844/// original IV's users. This is an optional optimization to reduce the needs of
845/// vector extracts.
// NOTE(review): scrape gap -- source lines 846-847 missing; presumably the
// signature (e.g. "void VPlanTransforms::legalizeAndOptimizeInductions(
// VPlan &Plan) {") and the definition of HeaderVPBB used below -- confirm.
848 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
849 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
850 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
851 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
852 if (!PhiR)
853 continue;
854
855 // Try to narrow wide and replicating recipes to uniform recipes, based on
856 // VPlan analysis.
857 // TODO: Apply to all recipes in the future, to replace legacy uniformity
858 // analysis.
859 auto Users = collectUsersRecursively(PhiR);
860 for (VPUser *U : reverse(Users)) {
861 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
862 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
863 // Skip recipes that shouldn't be narrowed.
864 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
865 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
866 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
867 continue;
868
869 // Skip recipes that may have other lanes than their first used.
// NOTE(review): scrape gap -- source line 870 missing; presumably the
// condition implementing the comment above (a first-lane-only /
// single-scalar check on Def).
871 continue;
872
// Narrow Def to a uniform (single-scalar) replicate recipe.
873 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
874 Def->operands(), /*IsUniform*/ true,
875 /*Mask*/ nullptr, /*Flags*/ *Def);
876 Clone->insertAfter(Def);
877 Def->replaceAllUsesWith(Clone);
878 }
879
880 // Replace wide pointer inductions which have only their scalars used by
881 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
882 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
883 if (!Plan.hasScalarVFOnly() &&
884 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
885 continue;
886
887 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
888 PtrIV->replaceAllUsesWith(PtrAdd);
889 continue;
890 }
891
892 // Replace widened induction with scalar steps for users that only use
893 // scalars.
894 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
895 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
896 return U->usesScalars(WideIV);
897 }))
898 continue;
899
900 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
// NOTE(review): scrape gap -- source line 901 missing; presumably binds
// Steps to a createScalarIVSteps(...) call whose arguments follow.
902 Plan, ID.getKind(), ID.getInductionOpcode(),
903 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
904 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
905 WideIV->getDebugLoc(), Builder);
906
907 // Update scalar users of IV to use Step instead.
908 if (!HasOnlyVectorVFs) {
909 assert(!Plan.hasScalableVF() &&
910 "plans containing a scalar VF cannot also include scalable VFs");
911 WideIV->replaceAllUsesWith(Steps);
912 } else {
913 bool HasScalableVF = Plan.hasScalableVF();
914 WideIV->replaceUsesWithIf(Steps,
915 [WideIV, HasScalableVF](VPUser &U, unsigned) {
916 if (HasScalableVF)
917 return U.usesFirstLaneOnly(WideIV);
918 return U.usesScalars(WideIV);
919 });
920 }
921 }
922}
923
924/// Check if \p VPV is an untruncated wide induction, either before or after the
925/// increment. If so return the header IV (before the increment), otherwise
926/// return null.
// NOTE(review): the function signature (original lines ~927-928) is elided in
// this listing; the body uses parameters VPV and PSE.
929 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
930 if (WideIV) {
931 // VPV itself is a wide induction, separately compute the end value for exit
932 // users if it is not a truncated IV.
933 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
934 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
935 }
936
937 // Check if VPV is an optimizable induction increment.
938 VPRecipeBase *Def = VPV->getDefiningRecipe();
939 if (!Def || Def->getNumOperands() != 2)
940 return nullptr;
// The wide IV may appear as either operand of the binary increment.
941 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
942 if (!WideIV)
943 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
944 if (!WideIV)
945 return nullptr;
946
947 auto IsWideIVInc = [&]() {
948 auto &ID = WideIV->getInductionDescriptor();
949
950 // Check if VPV increments the induction by the induction step.
951 VPValue *IVStep = WideIV->getStepValue();
952 switch (ID.getInductionOpcode()) {
953 case Instruction::Add:
954 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
955 case Instruction::FAdd:
956 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
957 case Instruction::FSub:
958 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
959 m_Specific(IVStep)));
960 case Instruction::Sub: {
961 // IVStep will be the negated step of the subtraction. Check if Step == -1
962 // * IVStep.
963 VPValue *Step;
964 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
965 return false;
// Use SCEV to prove the subtracted step is exactly the negation of the
// induction step; bail out if either expression is not computable.
966 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
967 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
968 ScalarEvolution &SE = *PSE.getSE();
969 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
970 !isa<SCEVCouldNotCompute>(StepSCEV) &&
971 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
972 }
973 default:
// Pointer inductions are incremented via GEP (ptr + step).
974 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
975 match(VPV, m_GetElementPtr(m_Specific(WideIV),
976 m_Specific(WideIV->getStepValue())));
977 }
978 llvm_unreachable("should have been covered by switch above");
979 };
980 return IsWideIVInc() ? WideIV : nullptr;
981}
982
983/// Attempts to optimize the induction variable exit values for users in the
984/// early exit block.
// NOTE(review): the first signature line (original ~985) and the initial
// m_ExtractLane/mask match (original ~991-992) are elided in this listing.
986 VPTypeAnalysis &TypeInfo,
987 VPBlockBase *PredVPBB,
988 VPValue *Op,
990 VPValue *Incoming, *Mask;
993 return nullptr;
994
// Only untruncated wide inductions (or their increments) can be rewritten.
995 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
996 if (!WideIV)
997 return nullptr;
998
999 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1000 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1001 return nullptr;
1002
1003 // Calculate the final index.
1004 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1005 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1006 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1007 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
1008
// The exiting lane is CanonicalIV + index of the first active mask lane,
// widened/truncated to the canonical IV type first.
1009 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
1010 VPValue *FirstActiveLane =
1011 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
1012 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1013 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1014 FirstActiveLaneType, DL);
1015 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1016
1017 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1018 // changed it means the exit is using the incremented value, so we need to
1019 // add the step.
1020 if (Incoming != WideIV) {
1021 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1022 EndValue = B.createAdd(EndValue, One, DL);
1023 }
1024
// Non-canonical inductions need a derived IV to map the canonical index to
// the induction's own start/step progression.
1025 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1026 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1027 VPIRValue *Start = WideIV->getStartValue();
1028 VPValue *Step = WideIV->getStepValue();
1029 EndValue = B.createDerivedIV(
1030 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1031 Start, EndValue, Step);
1032 }
1033
1034 return EndValue;
1035}
1036
1037/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1038/// VPDerivedIVRecipe for non-canonical inductions.
// NOTE(review): the function name line (original ~1039) and the declaration of
// the induction descriptor 'ID' (original ~1051) are elided in this listing.
1040 VPBuilder &VectorPHBuilder,
1041 VPTypeAnalysis &TypeInfo,
1042 VPValue *VectorTC) {
1043 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1044 // Truncated wide inductions resume from the last lane of their vector value
1045 // in the last vector iteration which is handled elsewhere.
1046 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1047 return nullptr;
1048
1049 VPIRValue *Start = WideIV->getStartValue();
1050 VPValue *Step = WideIV->getStepValue();
// For the canonical IV the end value is the vector trip count itself;
// otherwise derive it from start/step at the vector trip count.
1052 VPValue *EndValue = VectorTC;
1053 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1054 EndValue = VectorPHBuilder.createDerivedIV(
1055 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1056 Start, VectorTC, Step);
1057 }
1058
1059 // EndValue is derived from the vector trip count (which has the same type as
1060 // the widest induction) and thus may be wider than the induction here.
1061 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1062 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1063 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1064 ScalarTypeOfWideIV,
1065 WideIV->getDebugLoc());
1066 }
1067
1068 return EndValue;
1069}
1070
1071/// Attempts to optimize the induction variable exit values for users in the
1072/// exit block coming from the latch in the original scalar loop.
// NOTE(review): the function name line (original ~1073), remaining parameter
// lines and the match binding 'Incoming' (original ~1075-1078) are elided in
// this listing.
1074 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1077 VPWidenInductionRecipe *WideIV = nullptr;
1079 WideIV = getOptimizableIVOf(Incoming, PSE);
1080
1081 if (!WideIV)
1082 return nullptr;
1083
// End values for all wide inductions were pre-computed by the caller.
1084 VPValue *EndValue = EndValues.lookup(WideIV);
1085 assert(EndValue && "Must have computed the end value up front");
1086
1087 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1088 // changed it means the exit is using the incremented value, so we don't
1089 // need to subtract the step.
1090 if (Incoming != WideIV)
1091 return EndValue;
1092
1093 // Otherwise, subtract the step from the EndValue.
1094 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1095 VPValue *Step = WideIV->getStepValue();
1096 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1097 if (ScalarTy->isIntegerTy())
1098 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1099 if (ScalarTy->isPointerTy()) {
// Pointers are stepped back via ptradd with the negated (0 - Step) offset.
1100 Type *StepTy = TypeInfo.inferScalarType(Step);
1101 auto *Zero = Plan.getZero(StepTy);
1102 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1103 DebugLoc::getUnknown(), "ind.escape");
1104 }
1105 if (ScalarTy->isFloatingPointTy()) {
// Undo one FP induction step using the inverse of the induction's binop,
// preserving its fast-math flags.
1106 const auto &ID = WideIV->getInductionDescriptor();
1107 return B.createNaryOp(
1108 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1109 ? Instruction::FSub
1110 : Instruction::FAdd,
1111 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1112 }
1113 llvm_unreachable("all possible induction types must be handled");
1114 return nullptr;
1115}
1116
// NOTE(review): the function name line (original ~1117), the EndValues map
// declaration (~1124), and two call lines (~1131, ~1160) are elided in this
// listing; code below is reproduced as-is.
1118 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1119 // Compute end values for all inductions.
1120 VPTypeAnalysis TypeInfo(Plan);
1121 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1122 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1123 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
// When tail folding, the vector loop covers the full trip count; otherwise
// inductions resume at the vector trip count.
1125 VPValue *ResumeTC =
1126 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1127 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1128 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1129 if (!WideIV)
1130 continue;
1132 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1133 EndValues[WideIV] = EndValue;
1134 }
1135
// Replace ExitingIVValue recipes in the middle block with the pre-computed
// end values and drop the now-dead recipes.
1136 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1137 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1138 VPValue *Op;
1139 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1140 continue;
1141 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1142 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1143 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1144 R.eraseFromParent();
1145 }
1146 }
1147
1148 // Then, optimize exit block users.
1149 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1150 for (VPRecipeBase &R : ExitVPBB->phis()) {
1151 auto *ExitIRI = cast<VPIRPhi>(&R);
1152
// Each exit phi operand is handled per predecessor: latch exits via the
// middle block, early exits otherwise.
1153 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1154 VPValue *Escape = nullptr;
1155 if (PredVPBB == MiddleVPBB)
1156 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1157 ExitIRI->getOperand(Idx),
1158 EndValues, PSE);
1159 else
1161 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1162 if (Escape)
1163 ExitIRI->setOperand(Idx, Escape);
1164 }
1165 }
1166 }
1167}
1168
1169/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1170/// them with already existing recipes expanding the same SCEV expression.
// NOTE(review): the function signature and the SCEV2VPV map declaration
// (original ~1171-1172) and the iterated range (~1175) are elided here.
1173
1174 for (VPRecipeBase &R :
1176 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1177 if (!ExpR)
1178 continue;
1179
// First expansion of a SCEV wins; later duplicates are folded into it.
1180 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1181 if (Inserted)
1182 continue;
1183 ExpR->replaceAllUsesWith(V->second);
1184 ExpR->eraseFromParent();
1185 }
1186}
1187
// Worklist-based deletion of dead recipes starting from a root value.
// NOTE(review): the function signature (original ~1188) and the 'Seen' set
// declaration (~1190) are elided in this listing.
1189 SmallVector<VPValue *> WorkList;
1191 WorkList.push_back(V);
1192
1193 while (!WorkList.empty()) {
1194 VPValue *Cur = WorkList.pop_back_val();
// Skip values already visited to avoid reprocessing shared operands.
1195 if (!Seen.insert(Cur).second)
1196 continue;
1197 VPRecipeBase *R = Cur->getDefiningRecipe();
1198 if (!R)
1199 continue;
1200 if (!isDeadRecipe(*R))
1201 continue;
// Erasing R may make its operands dead; queue them before deleting.
1202 append_range(WorkList, R->operands());
1203 R->eraseFromParent();
1204 }
1205}
1206
1207/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1208/// Returns an optional pair, where the first element indicates whether it is
1209/// an intrinsic ID.
// NOTE(review): the function name/parameter line (original ~1211) and the
// first .Case<...> type list (~1214-1215) are elided in this listing.
1210static std::optional<std::pair<bool, unsigned>>
1212 return TypeSwitch<const VPSingleDefRecipe *,
1213 std::optional<std::pair<bool, unsigned>>>(R)
1216 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1217 .Case([](const VPWidenIntrinsicRecipe *I) {
// Intrinsic recipes report their vector intrinsic ID (first == true).
1218 return std::make_pair(true, I->getVectorIntrinsicID());
1219 })
1220 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1221 // For recipes that do not directly map to LLVM IR instructions,
1222 // assign opcodes after the last VPInstruction opcode (which is also
1223 // after the last IR Instruction opcode), based on the VPRecipeID.
1224 return std::make_pair(false,
1225 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1226 })
1227 .Default([](auto *) { return std::nullopt; });
1228}
1229
1230/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1231/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1232/// Operands are foldable live-ins.
// NOTE(review): the first signature line (original ~1233), the 'Ops' vector
// declaration (~1241) and several switch-case lines (~1268, 1270, 1273,
// 1286-1287) are elided in this listing; code below is reproduced as-is.
1234 ArrayRef<VPValue *> Operands,
1235 const DataLayout &DL,
1236 VPTypeAnalysis &TypeInfo) {
1237 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1238 if (!OpcodeOrIID)
1239 return nullptr;
1240
// Folding is only possible when every operand is a live-in with a concrete
// underlying IR value.
1242 for (VPValue *Op : Operands) {
1243 if (!match(Op, m_LiveIn()))
1244 return nullptr;
1245 Value *V = Op->getUnderlyingValue();
1246 if (!V)
1247 return nullptr;
1248 Ops.push_back(V);
1249 }
1250
1251 auto FoldToIRValue = [&]() -> Value * {
1252 InstSimplifyFolder Folder(DL);
1253 if (OpcodeOrIID->first) {
// Intrinsic path: only binary intrinsics are folded here.
1254 if (R.getNumOperands() != 2)
1255 return nullptr;
1256 unsigned ID = OpcodeOrIID->second;
1257 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1258 TypeInfo.inferScalarType(&R));
1259 }
1260 unsigned Opcode = OpcodeOrIID->second;
1261 if (Instruction::isBinaryOp(Opcode))
1262 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1263 Ops[0], Ops[1]);
1264 if (Instruction::isCast(Opcode))
1265 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1266 TypeInfo.inferScalarType(R.getVPSingleValue()));
1267 switch (Opcode) {
1269 return Folder.FoldSelect(Ops[0], Ops[1],
1271 case VPInstruction::Not:
1272 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1274 case Instruction::Select:
1275 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1276 case Instruction::ICmp:
1277 case Instruction::FCmp:
1278 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1279 Ops[1]);
1280 case Instruction::GetElementPtr: {
1281 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1282 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1283 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1284 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1285 }
// Byte-based pointer adds fold as an i8 GEP with the recipe's wrap flags.
1288 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1289 Ops[0], Ops[1],
1290 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1291 // An extract of a live-in is an extract of a broadcast, so return the
1292 // broadcasted element.
1293 case Instruction::ExtractElement:
1294 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1295 return Ops[0];
1296 }
1297 return nullptr;
1298 };
1299
// A successful IR fold is re-registered as a plan live-in.
1300 if (Value *V = FoldToIRValue())
1301 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1302 return nullptr;
1303}
1304
1305/// Try to simplify VPSingleDefRecipe \p Def.
// NOTE(review): this listing elides the function signature (original ~1306)
// and a number of pattern-match lines throughout (e.g. ~1381-1382, 1398,
// 1421-1422, 1432, 1437, 1441, 1473, 1508, 1535, 1540, 1565, 1567, 1580,
// 1591, 1593, 1601, 1618, 1626, 1641-1642, 1648, 1662, 1675, 1691,
// 1698-1699, 1726, 1735, 1741-1742); code below is reproduced as-is.
1307 VPlan *Plan = Def->getParent()->getPlan();
1308
1309 // Simplification of live-in IR values for SingleDef recipes using
1310 // InstSimplifyFolder.
1311 const DataLayout &DL = Plan->getDataLayout();
1312 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1313 return Def->replaceAllUsesWith(V);
1314
1315 // Fold PredPHI LiveIn -> LiveIn.
1316 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1317 VPValue *Op = PredPHI->getOperand(0);
1318 if (isa<VPIRValue>(Op))
1319 PredPHI->replaceAllUsesWith(Op);
1320 }
1321
1322 VPBuilder Builder(Def);
1323
1324 // Avoid replacing VPInstructions with underlying values with new
1325 // VPInstructions, as we would fail to create widen/replicate recipes from the
1326 // new VPInstructions without an underlying value, and miss out on some
1327 // transformations that only apply to widened/replicated recipes later, by
1328 // doing so.
1329 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1330 // VPInstructions without underlying values, as those will get skipped during
1331 // cost computation.
1332 bool CanCreateNewRecipe =
1333 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1334
// Fold trunc-of-extend chains: drop both casts if the types round-trip, or
// replace with a single narrower/wider cast otherwise.
1335 VPValue *A;
1336 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1337 Type *TruncTy = TypeInfo.inferScalarType(Def);
1338 Type *ATy = TypeInfo.inferScalarType(A);
1339 if (TruncTy == ATy) {
1340 Def->replaceAllUsesWith(A);
1341 } else {
1342 // Don't replace a non-widened cast recipe with a widened cast.
1343 if (!isa<VPWidenCastRecipe>(Def))
1344 return;
1345 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1346
1347 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1348 ? Instruction::SExt
1349 : Instruction::ZExt;
1350 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1351 TruncTy);
1352 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1353 // UnderlyingExt has distinct return type, used to retain legacy cost.
1354 Ext->setUnderlyingValue(UnderlyingExt);
1355 }
1356 Def->replaceAllUsesWith(Ext);
1357 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1358 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1359 Def->replaceAllUsesWith(Trunc);
1360 }
1361 }
1362#ifndef NDEBUG
1363 // Verify that the cached type info is for both A and its users is still
1364 // accurate by comparing it to freshly computed types.
1365 VPTypeAnalysis TypeInfo2(*Plan);
1366 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1367 for (VPUser *U : A->users()) {
1368 auto *R = cast<VPRecipeBase>(U);
1369 for (VPValue *VPV : R->definedValues())
1370 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1371 }
1372#endif
1373 }
1374
1375 // Simplify (X && Y) | (X && !Y) -> X.
1376 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1377 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1378 // recipes to be visited during simplification.
1379 VPValue *X, *Y, *Z;
1380 if (match(Def,
1383 Def->replaceAllUsesWith(X);
1384 Def->eraseFromParent();
1385 return;
1386 }
1387
1388 // x | AllOnes -> AllOnes
1389 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1390 return Def->replaceAllUsesWith(
1391 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1392
1393 // x | 0 -> x
1394 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1395 return Def->replaceAllUsesWith(X);
1396
1397 // x | !x -> AllOnes
1399 return Def->replaceAllUsesWith(
1400 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1401
1402 // x & 0 -> 0
1403 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1404 return Def->replaceAllUsesWith(
1405 Plan->getZero(TypeInfo.inferScalarType(Def)));
1406
1407 // x & AllOnes -> x
1408 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1409 return Def->replaceAllUsesWith(X);
1410
1411 // x && false -> false
1412 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1413 return Def->replaceAllUsesWith(Plan->getFalse());
1414
1415 // x && true -> x
1416 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1417 return Def->replaceAllUsesWith(X);
1418
1419 // (x && y) | (x && z) -> x && (y | z)
1420 if (CanCreateNewRecipe &&
1423 // Simplify only if one of the operands has one use to avoid creating an
1424 // extra recipe.
1425 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1426 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1427 return Def->replaceAllUsesWith(
1428 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1429
1430 // x && (x && y) -> x && y
1431 if (match(Def, m_LogicalAnd(m_VPValue(X),
1433 return Def->replaceAllUsesWith(Def->getOperand(1));
1434
1435 // x && (y && x) -> x && y
1436 if (match(Def, m_LogicalAnd(m_VPValue(X),
1438 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1439
1440 // x && !x -> 0
1442 return Def->replaceAllUsesWith(Plan->getFalse());
1443
// select c, x, x -> x
1444 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1445 return Def->replaceAllUsesWith(X);
1446
1447 // select c, false, true -> not c
1448 VPValue *C;
1449 if (CanCreateNewRecipe &&
1450 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1451 return Def->replaceAllUsesWith(Builder.createNot(C));
1452
1453 // select !c, x, y -> select c, y, x
1454 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1455 Def->setOperand(0, C);
1456 Def->setOperand(1, Y);
1457 Def->setOperand(2, X);
1458 return;
1459 }
1460
// Arithmetic identities: x + 0 -> x, x * 1 -> x, x * 0 -> 0.
1461 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1462 return Def->replaceAllUsesWith(A);
1463
1464 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1465 return Def->replaceAllUsesWith(A);
1466
1467 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1468 return Def->replaceAllUsesWith(
1469 Plan->getZero(TypeInfo.inferScalarType(Def)));
1470
// x * -1 -> 0 - x (negation), preserving nsw from the multiply.
1471 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1472 // Preserve nsw from the Mul on the new Sub.
1474 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1475 return Def->replaceAllUsesWith(
1476 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1477 Def->getDebugLoc(), "", NW));
1478 }
1479
// Strength-reduce multiply/udiv by a power of two to shl/lshr.
1480 const APInt *APC;
1481 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1482 APC->isPowerOf2())
1483 return Def->replaceAllUsesWith(Builder.createNaryOp(
1484 Instruction::Shl,
1485 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1486 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1487
1488 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1489 // not allowed in them.
1490 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1491 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1492 if (CanCreateNewRecipe && !IsInReplicateRegion &&
1493 match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1494 return Def->replaceAllUsesWith(Builder.createNaryOp(
1495 Instruction::LShr,
1496 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1497 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1498
1499 if (match(Def, m_Not(m_VPValue(A)))) {
// not (not x) -> x
1500 if (match(A, m_Not(m_VPValue(A))))
1501 return Def->replaceAllUsesWith(A);
1502
1503 // Try to fold Not into compares by adjusting the predicate in-place.
1504 CmpPredicate Pred;
1505 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1506 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
// Only invert the predicate if every user can absorb the inversion
// (negations and selects on the compare).
1507 if (all_of(Cmp->users(),
1509 m_Not(m_Specific(Cmp)),
1510 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1511 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1512 for (VPUser *U : to_vector(Cmp->users())) {
1513 auto *R = cast<VPSingleDefRecipe>(U);
1514 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1515 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1516 R->setOperand(1, Y);
1517 R->setOperand(2, X);
1518 } else {
1519 // not (cmp pred) -> cmp inv_pred
1520 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1521 R->replaceAllUsesWith(Cmp);
1522 }
1523 }
1524 // If Cmp doesn't have a debug location, use the one from the negation,
1525 // to preserve the location.
1526 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1527 Cmp->setDebugLoc(Def->getDebugLoc());
1528 }
1529 }
1530 }
1531
1532 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1533 // any-of (fcmp uno %A, %B), ...
1534 if (match(Def, m_AnyOf())) {
1536 VPRecipeBase *UnpairedCmp = nullptr;
1537 for (VPValue *Op : Def->operands()) {
1538 VPValue *X;
// Operands that aren't single-use self-unordered compares are kept; the
// rest are paired up into combined (fcmp uno a, b) checks.
1539 if (Op->getNumUsers() > 1 ||
1541 m_Deferred(X)))) {
1542 NewOps.push_back(Op);
1543 } else if (!UnpairedCmp) {
1544 UnpairedCmp = Op->getDefiningRecipe();
1545 } else {
1546 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1547 UnpairedCmp->getOperand(0), X));
1548 UnpairedCmp = nullptr;
1549 }
1550 }
1551
1552 if (UnpairedCmp)
1553 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1554
1555 if (NewOps.size() < Def->getNumOperands()) {
1556 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1557 return Def->replaceAllUsesWith(NewAnyOf);
1558 }
1559 }
1560
1561 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1562 // This is useful for fmax/fmin without fast-math flags, where we need to
1563 // check if any operand is NaN.
1564 if (CanCreateNewRecipe &&
1566 m_Deferred(X)),
1568 m_Deferred(Y))))) {
1569 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1570 return Def->replaceAllUsesWith(NewCmp);
1571 }
1572
1573 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1574 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1575 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1576 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1577 TypeInfo.inferScalarType(Def))
1578 return Def->replaceAllUsesWith(Def->getOperand(1));
1579
1581 m_One()))) {
// A type mismatch requires truncating X to the wide step type first.
1582 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1583 if (TypeInfo.inferScalarType(X) != WideStepTy)
1584 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1585 Def->replaceAllUsesWith(X);
1586 return;
1587 }
1588
1589 // For i1 vp.merges produced by AnyOf reductions:
1590 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1592 m_VPValue(X), m_VPValue())) &&
1594 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1595 Def->setOperand(1, Def->getOperand(0));
1596 Def->setOperand(0, Y);
1597 return;
1598 }
1599
1600 // Simplify MaskedCond with no block mask to its single operand.
1602 !cast<VPInstruction>(Def)->isMasked())
1603 return Def->replaceAllUsesWith(Def->getOperand(0));
1604
1605 // Look through ExtractLastLane.
1606 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
// ExtractLastLane(BuildVector ...) is just the BuildVector's last operand.
1607 if (match(A, m_BuildVector())) {
1608 auto *BuildVector = cast<VPInstruction>(A);
1609 Def->replaceAllUsesWith(
1610 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1611 return;
1612 }
1613 if (Plan->hasScalarVFOnly())
1614 return Def->replaceAllUsesWith(A);
1615 }
1616
1617 // Look through ExtractPenultimateElement (BuildVector ....).
1619 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1620 Def->replaceAllUsesWith(
1621 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1622 return;
1623 }
1624
// Extract of a constant lane from a BuildVector is its Idx-th operand.
1625 uint64_t Idx;
1627 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1628 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1629 return;
1630 }
1631
// BuildVector of identical operands is a broadcast of that operand.
1632 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1633 Def->replaceAllUsesWith(
1634 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1635 return;
1636 }
1637
1638 // Look through broadcast of single-scalar when used as select conditions; in
1639 // that case the scalar condition can be used directly.
1640 if (match(Def,
1643 "broadcast operand must be single-scalar");
1644 Def->setOperand(0, C);
1645 return;
1646 }
1647
1649 if (Def->getNumOperands() == 1) {
1650 Def->replaceAllUsesWith(Def->getOperand(0));
1651 return;
1652 }
// First-order recurrence phis with all-equal incoming values collapse to
// their first incoming value.
1653 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1654 if (all_equal(Phi->incoming_values()))
1655 Phi->replaceAllUsesWith(Phi->getOperand(0));
1656 }
1657 return;
1658 }
1659
1660 VPIRValue *IRV;
1661 if (Def->getNumOperands() == 1 &&
1663 return Def->replaceAllUsesWith(IRV);
1664
1665 // Some simplifications can only be applied after unrolling. Perform them
1666 // below.
1667 if (!Plan->isUnrolled())
1668 return;
1669
1670 // After unrolling, extract-lane may be used to extract values from multiple
1671 // scalar sources. Only simplify when extracting from a single scalar source.
1672 VPValue *LaneToExtract;
1673 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1674 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1676 return Def->replaceAllUsesWith(A);
1677
1678 // Simplify extract-lane with single source to extract-element.
1679 Def->replaceAllUsesWith(Builder.createNaryOp(
1680 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1681 return;
1682 }
1683
1684 // Look for cycles where Def is of the form:
1685 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1686 // IVInc = X + Step ; used by X and Def
1687 // Def = IVInc + Y
1688 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1689 // and if Inc exists, replace it with X.
1690 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1692 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1693 auto *Phi = cast<VPPhi>(X);
1694 auto *IVInc = Def->getOperand(0);
1695 if (IVInc->getNumUsers() == 2) {
1696 // If Phi has a second user (besides IVInc's defining recipe), it must
1697 // be Inc = Phi + Y for the fold to apply.
1700 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1701 Def->replaceAllUsesWith(IVInc);
1702 if (Inc)
1703 Inc->replaceAllUsesWith(Phi);
1704 Phi->setOperand(0, Y);
1705 return;
1706 }
1707 }
1708 }
1709
1710 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1711 // just the pointer operand.
1712 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1713 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1714 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1715
1716 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1717 // the start index is zero and only the first lane 0 is demanded.
1718 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1719 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1720 Steps->replaceAllUsesWith(Steps->getOperand(0));
1721 return;
1722 }
1723 }
1724 // Simplify redundant ReductionStartVector recipes after unrolling.
1725 VPValue *StartV;
1727 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
// Only in-loop reduction phis can use the plain start value directly.
1728 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1729 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1730 return PhiR && PhiR->isInLoop();
1731 });
1732 return;
1733 }
1734
1736 Def->replaceAllUsesWith(A);
1737 return;
1738 }
1739
1740 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1743 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1744 all_of(A->users(),
1745 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1746 return Def->replaceAllUsesWith(A);
1747 }
1748
// With a concrete UF of 1 there is only one part, so ExtractLastPart is a
// no-op.
1749 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1750 return Def->replaceAllUsesWith(A);
1751}
1752
// Driver: walk all blocks and try to simplify every single-def recipe.
// NOTE(review): the function signature and traversal setup (original
// ~1753-1754, 1757) are elided in this listing.
1755 Plan.getEntry());
1756 VPTypeAnalysis TypeInfo(Plan);
1758 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1759 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1760 simplifyRecipe(Def, TypeInfo);
1761 }
1762}
1763
1764/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1765/// header mask to be simplified further when tail folding, e.g. in
1766/// optimizeEVLMasks.
1767static void reassociateHeaderMask(VPlan &Plan) {
1768 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1769 if (!HeaderMask)
1770 return;
1771
// Seed the worklist with direct logical-ands of the header mask.
// NOTE(review): the push into the worklist (original ~1775) is elided in
// this listing.
1772 SmallVector<VPUser *> Worklist;
1773 for (VPUser *U : HeaderMask->users())
1774 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1776
1777 while (!Worklist.empty()) {
1778 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1779 VPValue *X, *Y;
// Only recipes of the shape ((HeaderMask && X) && Y) are reassociated.
1780 if (!R || !match(R, m_LogicalAnd(
1781 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1782 m_VPValue(Y))))
1783 continue;
// Users of the rewritten recipe may now match the pattern too.
1784 append_range(Worklist, R->users());
1785 VPBuilder Builder(R);
1786 R->replaceAllUsesWith(
1787 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1788 }
1789}
1790
 // NOTE(review): the function header is on a line not visible in this chunk;
 // judging by the body this is the single-scalar narrowing transform
 // (presumably narrowToSingleScalarRecipes) — confirm against full source.
1792 if (Plan.hasScalarVFOnly())
1793 return;
1794
1795 // Try to narrow wide and replicating recipes to single scalar recipes,
1796 // based on VPlan analysis. Only process blocks in the loop region for now,
1797 // without traversing into nested regions, as recipes in replicate regions
1798 // cannot be converted yet.
1801 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1803 VPWidenStoreRecipe>(&R))
1804 continue;
 // Already-single-scalar or predicated replicates need no narrowing.
1805 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1806 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1807 continue;
1808
1809 // Convert an unmasked scatter with an uniform address into
1810 // extract-last-lane + scalar store.
1811 // TODO: Add a profitability check comparing the cost of a scatter vs.
1812 // extract + scalar store.
1813 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1814 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1815 !WidenStoreR->isConsecutive()) {
1816 assert(!WidenStoreR->isReverse() &&
1817 "Not consecutive memory recipes shouldn't be reversed");
1818 VPValue *Mask = WidenStoreR->getMask();
1819
1820 // Only convert the scatter to a scalar store if it is unmasked.
1821 // TODO: Support converting scatter masked by the header mask to scalar
1822 // store.
1823 if (Mask)
1824 continue;
1825
 // Extract the last lane of the stored value (extract recipe creation is
 // on a line not visible in this chunk) and store it as a scalar.
1827 {WidenStoreR->getOperand(1)});
1828 Extract->insertBefore(WidenStoreR);
1829
1830 // TODO: Sink the scalar store recipe to middle block if possible.
1831 auto *ScalarStore = new VPReplicateRecipe(
1832 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1833 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1834 *WidenStoreR /*Metadata*/);
1835 ScalarStore->insertBefore(WidenStoreR);
1836 WidenStoreR->eraseFromParent();
1837 continue;
1838 }
1839
 // Replicating store to a single-scalar address: keep only the last
 // lane (and last part, if the stored value is invariant across parts).
1840 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1841 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1842 vputils::isSingleScalar(RepR->getOperand(1))) {
1843 auto *Clone = new VPReplicateRecipe(
1844 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1845 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1846 *RepR /*Metadata*/, RepR->getDebugLoc());
1847 Clone->insertBefore(RepOrWidenR);
1848 VPBuilder Builder(Clone);
1849 VPValue *ExtractOp = Clone->getOperand(0);
1850 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1851 ExtractOp =
1852 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1853 ExtractOp =
1854 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1855 Clone->setOperand(0, ExtractOp);
1856 RepR->eraseFromParent();
1857 continue;
1858 }
1859
1860 // Skip recipes that aren't single scalars.
1861 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1862 continue;
1863
1864 // Predicate to check if a user of Op introduces extra broadcasts.
1865 auto IntroducesBCastOf = [](const VPValue *Op) {
1866 return [Op](const VPUser *U) {
1867 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1871 VPI->getOpcode()))
1872 return false;
1873 }
 // Users consuming the vector form would need Op broadcast.
1874 return !U->usesScalars(Op);
1875 };
1876 };
1877
 // Bail out if narrowing would merely move broadcasts around: the recipe
 // itself would need broadcasting AND none of its operands would.
1878 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1879 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1880 if (any_of(
1881 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1882 IntroducesBCastOf(Op)))
1883 return false;
1884 // Non-constant live-ins require broadcasts, while constants do not
1885 // need explicit broadcasts.
1886 auto *IRV = dyn_cast<VPIRValue>(Op);
1887 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1888 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1889 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1890 }))
1891 continue;
1892
 // Replace the wide/replicating recipe with a single-scalar clone.
1893 auto *Clone = new VPReplicateRecipe(
1894 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1895 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1896 Clone->insertBefore(RepOrWidenR);
1897 RepOrWidenR->replaceAllUsesWith(Clone);
1898 if (isDeadRecipe(*RepOrWidenR))
1899 RepOrWidenR->eraseFromParent();
1900 }
1901 }
1902}
1903
1904/// Try to see if all of \p Blend's masks share a common value logically and'ed
1905/// and remove it from the masks.
 // NOTE(review): the function signature line is not visible in this chunk
 // (presumably `static void removeCommonBlendMask(VPBlendRecipe *Blend)`).
1907 if (Blend->isNormalized())
1908 return;
 // Candidate common factor: the left operand of the first mask's and.
1909 VPValue *CommonEdgeMask;
1910 if (!match(Blend->getMask(0),
1911 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1912 return;
 // Every mask must have the shape (CommonEdgeMask && X); otherwise the
 // factor is not common to all incoming edges and must be kept.
1913 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1914 if (!match(Blend->getMask(I),
1915 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1916 return;
 // Strip the common factor, leaving only the edge-specific X operand.
1917 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1918 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1919}
1920
1921/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1922/// to make sure the masks are simplified.
1923static void simplifyBlends(VPlan &Plan) {
1926 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1927 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1928 if (!Blend)
1929 continue;
1930
1931 removeCommonBlendMask(Blend);
1932
1933 // Try to remove redundant blend recipes.
 // Collect the distinct incoming values whose masks are not known-false;
 // if only one remains, the blend is a no-op.
1934 SmallPtrSet<VPValue *, 4> UniqueValues;
1935 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1936 UniqueValues.insert(Blend->getIncomingValue(0));
1937 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1938 if (!match(Blend->getMask(I), m_False()))
1939 UniqueValues.insert(Blend->getIncomingValue(I));
1940
1941 if (UniqueValues.size() == 1) {
1942 Blend->replaceAllUsesWith(*UniqueValues.begin());
1943 Blend->eraseFromParent();
1944 continue;
1945 }
1946
1947 if (Blend->isNormalized())
1948 continue;
1949
1950 // Normalize the blend so its first incoming value is used as the initial
1951 // value with the others blended into it.
1952
1953 unsigned StartIndex = 0;
1954 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1955 // If a value's mask is used only by the blend then is can be deadcoded.
1956 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1957 // that's used by multiple blends where it can be removed from them all.
1958 VPValue *Mask = Blend->getMask(I);
1959 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1960 StartIndex = I;
1961 break;
1962 }
1963 }
1964
 // Build the normalized operand list: start value first, then each
 // remaining (value, mask) pair.
1965 SmallVector<VPValue *, 4> OperandsWithMask;
1966 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1967
1968 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1969 if (I == StartIndex)
1970 continue;
1971 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1972 OperandsWithMask.push_back(Blend->getMask(I));
1973 }
1974
1975 auto *NewBlend =
1976 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1977 OperandsWithMask, *Blend, Blend->getDebugLoc());
1978 NewBlend->insertBefore(&R);
1979
1980 VPValue *DeadMask = Blend->getMask(StartIndex);
1981 Blend->replaceAllUsesWith(NewBlend);
1982 Blend->eraseFromParent();
1984
1985 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1986 VPValue *NewMask;
1987 if (NewBlend->getNumOperands() == 3 &&
1988 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1989 VPValue *Inc0 = NewBlend->getOperand(0);
1990 VPValue *Inc1 = NewBlend->getOperand(1);
1991 VPValue *OldMask = NewBlend->getOperand(2);
 // Swap incoming values and use the un-negated mask directly.
1992 NewBlend->setOperand(0, Inc1);
1993 NewBlend->setOperand(1, Inc0);
1994 NewBlend->setOperand(2, NewMask);
 // The Not may now be dead; clean it up eagerly.
1995 if (OldMask->getNumUsers() == 0)
1996 cast<VPInstruction>(OldMask)->eraseFromParent();
1997 }
1998 }
1999 }
2000}
2001
2002/// Optimize the width of vector induction variables in \p Plan based on a known
2003/// constant Trip Count, \p BestVF and \p BestUF.
 // NOTE(review): the first line of the signature is not visible in this chunk
 // (presumably `static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,` —
 // confirm). Returns true if any IV was narrowed.
2005 ElementCount BestVF,
2006 unsigned BestUF) {
2007 // Only proceed if we have not completely removed the vector region.
2008 if (!Plan.getVectorLoopRegion())
2009 return false;
2010
 // Requires a compile-time-constant trip count and a fixed VF.
2011 const APInt *TC;
2012 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2013 return false;
2014
2015 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2016 // and UF. Returns at least 8.
2017 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2018 APInt AlignedTC =
2021 APInt MaxVal = AlignedTC - 1;
2022 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2023 };
2024 unsigned NewBitWidth =
2025 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2026
2027 LLVMContext &Ctx = Plan.getContext();
2028 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2029
2030 bool MadeChange = false;
2031
2032 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2033 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2034 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2035
2036 // Currently only handle canonical IVs as it is trivial to replace the start
2037 // and stop values, and we currently only perform the optimization when the
2038 // IV has a single use.
2039 if (!WideIV || !WideIV->isCanonical() ||
2040 WideIV->hasMoreThanOneUniqueUser() ||
2041 NewIVTy == WideIV->getScalarType())
2042 continue;
2043
2044 // Currently only handle cases where the single user is a header-mask
2045 // comparison with the backedge-taken-count.
2046 VPUser *SingleUser = WideIV->getSingleUser();
2047 if (!SingleUser ||
2048 !match(SingleUser, m_ICmp(m_Specific(WideIV),
2051 continue;
2052
2053 // Update IV operands and comparison bound to use new narrower type.
2054 auto *NewStart = Plan.getZero(NewIVTy);
2055 WideIV->setStartValue(NewStart);
2056 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2057 WideIV->setStepValue(NewStep);
2058
 // Truncate the backedge-taken count in the preheader so the compare
 // operates entirely in the narrower type.
2059 auto *NewBTC = new VPWidenCastRecipe(
2060 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2061 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2062 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2063 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2064 Cmp->setOperand(1, NewBTC);
2065
2066 MadeChange = true;
2067 }
2068
2069 return MadeChange;
2070}
2071
2072/// Return true if \p Cond is known to be true for given \p BestVF and \p
2073/// BestUF.
 // NOTE(review): the first signature line is not visible in this chunk
 // (presumably `static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,`).
2075 ElementCount BestVF, unsigned BestUF,
 // An any-of condition is true if any of its operands is provably true;
 // recurse into the operands. (The guarding match is on a line not visible
 // in this chunk — confirm.)
2078 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2079 &PSE](VPValue *C) {
2080 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2081 });
2082
 // Otherwise only handle the canonical-IV exit compare shape.
2083 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2085 m_Specific(CanIV->getBackedgeValue()),
2086 m_Specific(&Plan.getVectorTripCount()))))
2087 return false;
2088
2089 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
2090 // count is not conveniently available as SCEV so far, so we compare directly
2091 // against the original trip count. This is stricter than necessary, as we
2092 // will only return true if the trip count == vector trip count.
2093 const SCEV *VectorTripCount =
2095 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2096 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2097 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2098 "Trip count SCEV must be computable");
2099 ScalarEvolution &SE = *PSE.getSE();
2100 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2101 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2102 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2103}
2104
2105/// Try to replace multiple active lane masks used for control flow with
2106/// a single, wide active lane mask instruction followed by multiple
2107/// extract subvector intrinsics. This applies to the active lane mask
2108/// instructions both in the loop and in the preheader.
2109/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2110/// new extracts from the first active lane mask, which has it's last
2111/// operand (multiplier) set to UF.
 // NOTE(review): first signature line not visible in this chunk (presumably
 // `static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,`).
2113 unsigned UF) {
 // Only profitable/valid for vector VFs with interleaving, behind a flag.
2114 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2115 return false;
2116
2117 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2118 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2119 auto *Term = &ExitingVPBB->back();
2120
2121 using namespace llvm::VPlanPatternMatch;
2123 m_VPValue(), m_VPValue(), m_VPValue())))))
2124 return false;
2125
2126 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2127 LLVMContext &Ctx = Plan.getContext();
2128
 // Emit one vector.extract per unroll part, pulling a VF-wide slice out of
 // the wide (VF x UF) lane mask \p ALM, and record them in \p Extracts.
2129 auto ExtractFromALM = [&](VPInstruction *ALM,
2130 SmallVectorImpl<VPValue *> &Extracts) {
2131 DebugLoc DL = ALM->getDebugLoc();
2132 for (unsigned Part = 0; Part < UF; ++Part) {
2134 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2135 auto *Ext =
2136 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2137 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2138 Extracts[Part] = Ext;
2139 Ext->insertAfter(ALM);
2140 }
2141 };
2142
2143 // Create a list of each active lane mask phi, ordered by unroll part.
2145 for (VPRecipeBase &R : Header->phis()) {
2147 if (!Phi)
2148 continue;
2149 VPValue *Index = nullptr;
2150 match(Phi->getBackedgeValue(),
2152 assert(Index && "Expected index from ActiveLaneMask instruction");
2153
 // The unroll part is encoded as the multiplier of the per-part IV
 // increment feeding the mask.
2154 uint64_t Part;
2155 if (match(Index,
2157 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2158 Phis[Part] = Phi;
2159 else {
2160 // Anything other than a CanonicalIVIncrementForPart is part 0
2161 assert(!match(
2162 Index,
2164 Phis[0] = Phi;
2165 }
2166 }
2167
2168 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
2169 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2170
2171 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2172 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2173
2174 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2175 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2176 "Expected incoming values of Phi to be ActiveLaneMasks");
2177
2178 // When using wide lane masks, the return type of the get.active.lane.mask
2179 // intrinsic is VF x UF (last operand).
2180 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2181 EntryALM->setOperand(2, ALMMultiplier);
2182 LoopALM->setOperand(2, ALMMultiplier);
2183
2184 // Create UF x extract vectors and insert into preheader.
2185 SmallVector<VPValue *> EntryExtracts(UF);
2186 ExtractFromALM(EntryALM, EntryExtracts);
2187
2188 // Create UF x extract vectors and insert before the loop compare & branch,
2189 // updating the compare to use the first extract.
2190 SmallVector<VPValue *> LoopExtracts(UF);
2191 ExtractFromALM(LoopALM, LoopExtracts);
2192 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2193 Not->setOperand(0, LoopExtracts[0]);
2194
2195 // Update the incoming values of active lane mask phis.
2196 for (unsigned Part = 0; Part < UF; ++Part) {
2197 Phis[Part]->setStartValue(EntryExtracts[Part]);
2198 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2199 }
2200
2201 return true;
2202}
2203
2204/// Try to simplify the branch condition of \p Plan. This may restrict the
2205/// resulting plan to \p BestVF and \p BestUF.
 // NOTE(review): first signature line not visible in this chunk (presumably
 // `static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,`).
2207 unsigned BestUF,
2209 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2210 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2211 auto *Term = &ExitingVPBB->back();
2212 VPValue *Cond;
2213 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2214 // Check if the branch condition compares the canonical IV increment (for main
2215 // loop), or the canonical IV increment plus an offset (for epilog loop).
2216 if (match(Term, m_BranchOnCount(
2217 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2218 m_VPValue())) ||
2220 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2221 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2222 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2223 const SCEV *VectorTripCount =
2225 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2226 VectorTripCount =
2228 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2229 "Trip count SCEV must be computable");
2230 ScalarEvolution &SE = *PSE.getSE();
2231 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2232 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2233 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2234 return false;
2235 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2237 // For BranchOnCond, check if we can prove the condition to be true using VF
2238 // and UF.
2239 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2240 return false;
2241 } else {
2242 return false;
2243 }
2244
2245 // The vector loop region only executes once. Convert terminator of the
2246 // exiting block to exit in the first iteration.
 // For a two-condition branch it suffices to force the exit condition true.
2247 if (match(Term, m_BranchOnTwoConds())) {
2248 Term->setOperand(1, Plan.getTrue());
2249 return true;
2250 }
2251
 // Otherwise replace the terminator with an unconditional exit branch.
2252 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2253 {}, Term->getDebugLoc());
2254 ExitingVPBB->appendRecipe(BOC);
2255 Term->eraseFromParent();
2256
2257 return true;
2258}
2259
2260/// From the definition of llvm.experimental.get.vector.length,
2261/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
 // NOTE(review): the signature is not visible in this chunk; the body replaces
 // an ExplicitVectorLength recipe with its AVL operand (suitably truncated)
 // when SCEV proves AVL <= VF, and returns true on change — confirm name.
2265 vp_depth_first_deep(Plan.getEntry()))) {
2266 for (VPRecipeBase &R : *VPBB) {
2267 VPValue *AVL;
2268 if (!match(&R, m_EVL(m_VPValue(AVL))))
2269 continue;
2270
 // Prove AVL <= VF via SCEV; bail if the AVL is not analyzable.
2271 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2272 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2273 continue;
2274 ScalarEvolution &SE = *PSE.getSE();
2275 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2276 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2277 continue;
2278
 // EVL is i32; truncate the AVL if needed and constant-fold the truncate
 // when its operands are live-ins.
2280 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2281 R.getDebugLoc());
2282 if (Trunc != AVL) {
2283 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2284 const DataLayout &DL = Plan.getDataLayout();
2285 VPTypeAnalysis TypeInfo(Plan);
2286 if (VPValue *Folded =
2287 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2288 Trunc = Folded;
2289 }
2290 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2291 return true;
2292 }
2293 }
2294 return false;
2295}
2296
 // NOTE(review): first signature line not visible in this chunk (presumably
 // `void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,`).
 // Applies the VF/UF-specific optimizations above and pins the plan to
 // BestVF/BestUF once any of them fired.
2298 unsigned BestUF,
2300 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2301 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2302
2303 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2304 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2305 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2306
2307 if (MadeChange) {
 // The transformations above are only valid for this specific VF/UF.
2308 Plan.setVF(BestVF);
2309 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2310 }
2311}
2312
2313/// Sink users of \p FOR after the recipe defining the previous value \p
2314/// Previous of the recurrence. \returns true if all users of \p FOR could be
2315/// re-arranged as needed or false if it is not possible.
2316static bool
2318 VPRecipeBase *Previous,
2319 VPDominatorTree &VPDT) {
2320 // If Previous is a live-in (no defining recipe), it naturally dominates all
2321 // recipes in the loop, so no sinking is needed.
2322 if (!Previous)
2323 return true;
2324
2325 // Collect recipes that need sinking.
 // (The WorkList/Seen declarations are on lines not visible in this chunk.)
2328 Seen.insert(Previous);
 // Returns false if \p SinkCandidate cannot legally be moved below Previous.
2329 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2330 // The previous value must not depend on the users of the recurrence phi. In
2331 // that case, FOR is not a fixed order recurrence.
2332 if (SinkCandidate == Previous)
2333 return false;
2334
 // Header phis, already-seen recipes, and recipes already below Previous
 // need no sinking.
2335 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2336 !Seen.insert(SinkCandidate).second ||
2337 VPDT.properlyDominates(Previous, SinkCandidate))
2338 return true;
2339
2340 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2341 return false;
2342
2343 WorkList.push_back(SinkCandidate);
2344 return true;
2345 };
2346
2347 // Recursively sink users of FOR after Previous.
2348 WorkList.push_back(FOR);
2349 for (unsigned I = 0; I != WorkList.size(); ++I) {
2350 VPRecipeBase *Current = WorkList[I];
2351 assert(Current->getNumDefinedValues() == 1 &&
2352 "only recipes with a single defined value expected");
2353
2354 for (VPUser *User : Current->getVPSingleValue()->users()) {
2355 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2356 return false;
2357 }
2358 }
2359
2360 // Keep recipes to sink ordered by dominance so earlier instructions are
2361 // processed first.
2362 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2363 return VPDT.properlyDominates(A, B);
2364 });
2365
 // Move each candidate directly after the previously-moved one, preserving
 // their relative (dominance) order.
2366 for (VPRecipeBase *SinkCandidate : WorkList) {
2367 if (SinkCandidate == FOR)
2368 continue;
2369
2370 SinkCandidate->moveAfter(Previous);
2371 Previous = SinkCandidate;
2372 }
2373 return true;
2374}
2375
2376/// Try to hoist \p Previous and its operands before all users of \p FOR.
 // NOTE(review): first signature line not visible in this chunk (presumably
 // `static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,`).
2378 VPRecipeBase *Previous,
2379 VPDominatorTree &VPDT) {
2380 if (cannotHoistOrSinkRecipe(*Previous))
2381 return false;
2382
2383 // Collect recipes that need hoisting.
2384 SmallVector<VPRecipeBase *> HoistCandidates;
2386 VPRecipeBase *HoistPoint = nullptr;
2387 // Find the closest hoist point by looking at all users of FOR and selecting
2388 // the recipe dominating all other users.
2389 for (VPUser *U : FOR->users()) {
2390 auto *R = cast<VPRecipeBase>(U);
2391 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2392 HoistPoint = R;
2393 }
2394 assert(all_of(FOR->users(),
2395 [&VPDT, HoistPoint](VPUser *U) {
2396 auto *R = cast<VPRecipeBase>(U);
2397 return HoistPoint == R ||
2398 VPDT.properlyDominates(HoistPoint, R);
2399 }) &&
2400 "HoistPoint must dominate all users of FOR");
2401
 // Returns the defining recipe of \p HoistCandidateV if it still needs to be
 // hoisted above HoistPoint, or nullptr if no hoisting is required.
2402 auto NeedsHoisting = [HoistPoint, &VPDT,
2403 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2404 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2405 if (!HoistCandidate)
2406 return nullptr;
2407 VPRegionBlock *EnclosingLoopRegion =
2408 HoistCandidate->getParent()->getEnclosingLoopRegion();
2409 assert((!HoistCandidate->getRegion() ||
2410 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2411 "CFG in VPlan should still be flat, without replicate regions");
2412 // Hoist candidate was already visited, no need to hoist.
2413 if (!Visited.insert(HoistCandidate).second)
2414 return nullptr;
2415
2416 // Candidate is outside loop region or a header phi, dominates FOR users w/o
2417 // hoisting.
2418 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2419 return nullptr;
2420
2421 // If we reached a recipe that dominates HoistPoint, we don't need to
2422 // hoist the recipe.
2423 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2424 return nullptr;
2425 return HoistCandidate;
2426 };
2427
 // Fast path: Previous already dominates all FOR users.
2439 if (!NeedsHoisting(Previous->getVPSingleValue()))
2429 return true;
2430
2431 // Recursively try to hoist Previous and its operands before all users of FOR.
2432 HoistCandidates.push_back(Previous);
2433
2434 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2435 VPRecipeBase *Current = HoistCandidates[I];
2436 assert(Current->getNumDefinedValues() == 1 &&
2437 "only recipes with a single defined value expected");
2438 if (cannotHoistOrSinkRecipe(*Current))
2439 return false;
2440
2441 for (VPValue *Op : Current->operands()) {
2442 // If we reach FOR, it means the original Previous depends on some other
2443 // recurrence that in turn depends on FOR. If that is the case, we would
2444 // also need to hoist recipes involving the other FOR, which may break
2445 // dependencies.
2446 if (Op == FOR)
2447 return false;
2448
2449 if (auto *R = NeedsHoisting(Op)) {
2450 // Bail out if the recipe defines multiple values.
2451 // TODO: Hoisting such recipes requires additional handling.
2452 if (R->getNumDefinedValues() != 1)
2453 return false;
2454 HoistCandidates.push_back(R);
2455 }
2456 }
2457 }
2458
2459 // Order recipes to hoist by dominance so earlier instructions are processed
2460 // first.
2461 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2462 return VPDT.properlyDominates(A, B);
2463 });
2464
2465 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2466 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2467 HoistPoint->getIterator());
2468 }
2469
2470 return true;
2471}
2472
 // NOTE(review): first signature line not visible in this chunk (presumably
 // `bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,`). Rearranges
 // recipes so each fixed-order recurrence phi can be lowered via a
 // FirstOrderRecurrenceSplice; returns false if that is impossible.
2474 VPBuilder &LoopBuilder) {
2475 VPDominatorTree VPDT(Plan);
2476 VPTypeAnalysis TypeInfo(Plan);
2477
 // Collect all fixed-order recurrence phis up front, as the loop below
 // mutates the plan.
2479 for (VPRecipeBase &R :
2482 RecurrencePhis.push_back(FOR);
2483
2484 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2486 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2487 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2488 // to terminate.
2489 while (auto *PrevPhi =
2491 assert(PrevPhi->getParent() == FOR->getParent());
2492 assert(SeenPhis.insert(PrevPhi).second);
2493 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2494 }
2495
 // Either sink FOR's users below Previous or hoist Previous above them;
 // if neither works the recurrence cannot be handled.
2496 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2497 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2498 return false;
2499
2500 // Introduce a recipe to combine the incoming and previous values of a
2501 // fixed-order recurrence.
2502 VPBasicBlock *InsertBlock =
2503 Previous ? Previous->getParent() : FOR->getParent();
2504 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2505 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2506 else
2507 LoopBuilder.setInsertPoint(InsertBlock,
2508 std::next(Previous->getIterator()));
2509
2510 auto *RecurSplice =
2512 {FOR, FOR->getBackedgeValue()});
2513
2514 FOR->replaceAllUsesWith(RecurSplice);
2515 // Set the first operand of RecurSplice to FOR again, after replacing
2516 // all users.
2517 RecurSplice->setOperand(0, FOR);
2518
2519 // Check for users extracting at the penultimate active lane of the FOR.
2520 // If only a single lane is active in the current iteration, we need to
2521 // select the last element from the previous iteration (from the FOR phi
2522 // directly).
2523 for (VPUser *U : RecurSplice->users()) {
2525 m_Specific(RecurSplice))))
2526 continue;
2527
2529 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2530 Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2531 VPValue *Zero = Plan.getConstantInt(Ty, 0);
2532 VPValue *One = Plan.getConstantInt(Ty, 1);
 // Lane (LastActiveLane - 1) of the current iteration's value...
2533 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2534 VPValue *PenultimateLastIter =
2535 B.createNaryOp(VPInstruction::ExtractLane,
2536 {PenultimateIndex, FOR->getBackedgeValue()});
 // ...or the last lane of the previous iteration when only lane 0 is
 // active (LastActiveLane == 0).
2537 VPValue *LastPrevIter =
2538 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2539
2540 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2541 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2542 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2543 }
2544 }
2545 return true;
2546}
2547
 // NOTE(review): the signature is not visible in this chunk; the body drops
 // poison-generating flags (e.g. nuw/nsw) from all recursive users of
 // add/mul/sub-style reduction phis, since reordering reduction operations
 // can otherwise introduce poison — confirm name (clearReductionWrapFlags?).
2549 for (VPRecipeBase &R :
2551 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2552 if (!PhiR)
2553 continue;
 // Only integer add/mul/sub-family reductions carry wrap flags of concern.
2554 RecurKind RK = PhiR->getRecurrenceKind();
2555 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2557 continue;
2558
2559 for (VPUser *U : collectUsersRecursively(PhiR))
2560 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2561 RecWithFlags->dropPoisonGeneratingFlags();
2562 }
2563 }
2564}
2565
2566namespace {
 // DenseMapInfo specialization keying VPSingleDefRecipes by their semantic
 // content (recipe ID, opcode/intrinsic, GEP source element type, result
 // type, operands, predicate), used by cse() below to detect duplicates.
2567struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
 // Empty/tombstone keys must never be dereferenced by hash/equality.
2568 static bool isSentinel(const VPSingleDefRecipe *Def) {
2569 return Def == getEmptyKey() || Def == getTombstoneKey();
2570 }
2571
2572 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2573 /// return that source element type.
2574 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2575 // All VPInstructions that lower to GEPs must have the i8 source element
2576 // type (as they are PtrAdds), so we omit it.
2578 .Case([](const VPReplicateRecipe *I) -> Type * {
2579 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2580 return GEP->getSourceElementType();
2581 return nullptr;
2582 })
2583 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2584 [](auto *I) { return I->getSourceElementType(); })
2585 .Default([](auto *) { return nullptr; });
2586 }
2587
2588 /// Returns true if recipe \p Def can be safely handed for CSE.
2589 static bool canHandle(const VPSingleDefRecipe *Def) {
2590 // We can extend the list of handled recipes in the future,
2591 // provided we account for the data embedded in them while checking for
2592 // equality or hashing.
2593 auto C = getOpcodeOrIntrinsicID(Def);
2594
2595 // The issue with (Insert|Extract)Value is that the index of the
2596 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2597 // VPlan.
2598 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2599 C->second == Instruction::ExtractValue)))
2600 return false;
2601
2602 // During CSE, we can only handle recipes that don't read from memory: if
2603 // they read from memory, there could be an intervening write to memory
2604 // before the next instance is CSE'd, leading to an incorrect result.
2605 return !Def->mayReadFromMemory();
2606 }
2607
2608 /// Hash the underlying data of \p Def.
2609 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2610 const VPlan *Plan = Def->getParent()->getPlan();
2611 VPTypeAnalysis TypeInfo(*Plan);
2612 hash_code Result = hash_combine(
2613 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2614 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2616 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2617 if (RFlags->hasPredicate())
2618 return hash_combine(Result, RFlags->getPredicate());
2619 return Result;
2620 }
2621
2622 /// Check equality of underlying data of \p L and \p R.
2623 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2624 if (isSentinel(L) || isSentinel(R))
2625 return L == R;
 // Compare the same data that getHashValue folds in, plus operands.
2626 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2628 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2630 !equal(L->operands(), R->operands()))
2631 return false;
2633 "must have valid opcode info for both recipes");
2634 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2635 if (LFlags->hasPredicate() &&
2636 LFlags->getPredicate() !=
2637 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2638 return false;
2639 // Recipes in replicate regions implicitly depend on predicate. If either
2640 // recipe is in a replicate region, only consider them equal if both have
2641 // the same parent.
2642 const VPRegionBlock *RegionL = L->getRegion();
2643 const VPRegionBlock *RegionR = R->getRegion();
2644 if (((RegionL && RegionL->isReplicator()) ||
2645 (RegionR && RegionR->isReplicator())) &&
2646 L->getParent() != R->getParent())
2647 return false;
 // Finally require identical inferred scalar types.
2648 const VPlan *Plan = L->getParent()->getPlan();
2649 VPTypeAnalysis TypeInfo(*Plan);
2650 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2651 }
2652};
2653} // end anonymous namespace
2654
2655/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2656/// Plan.
 // NOTE(review): signature line not visible in this chunk (presumably
 // `static void cse(VPlan &Plan) {`, with the CSEMap declaration following).
2658 VPDominatorTree VPDT(Plan);
2660
2662 Plan.getEntry());
2664 for (VPRecipeBase &R : *VPBB) {
2665 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2666 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2667 continue;
 // An equivalent recipe was seen earlier (per VPCSEDenseMapInfo).
2668 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2669 // V must dominate Def for a valid replacement.
2670 if (!VPDT.dominates(V->getParent(), VPBB))
2671 continue;
2672 // Only keep flags present on both V and Def.
2673 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2674 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2675 Def->replaceAllUsesWith(V);
2676 continue;
2677 }
 // First occurrence: record it as the canonical instance.
2678 CSEMap[Def] = Def;
2679 }
2680 }
2681}
2682
2683/// Move loop-invariant recipes out of the vector loop region in \p Plan.
// NOTE(review): doxygen-scrape listing; the hoist/sink eligibility checks and
// the shallow-traversal loop headers (orig. lines 2695, 2698, 2714, 2717) are
// elided here — verify against upstream before editing.
2684static void licm(VPlan &Plan) {
2685 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2686
2687 // Hoist any loop invariant recipes from the vector loop region to the
2688 // preheader. Preform a shallow traversal of the vector loop region, to
2689 // exclude recipes in replicate regions. Since the top-level blocks in the
2690 // vector loop region are guaranteed to execute if the vector pre-header is,
2691 // we don't need to check speculation safety.
2692 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2693 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2694 "Expected vector prehader's successor to be the vector loop region");
2696 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2697 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2699 continue;
// A recipe is hoistable only if every operand is defined outside all loop
// regions; otherwise it stays put.
2700 if (any_of(R.operands(), [](VPValue *Op) {
2701 return !Op->isDefinedOutsideLoopRegions();
2702 }))
2703 continue;
2704 R.moveBefore(*Preheader, Preheader->end());
2705 }
2706 }
2707
2708#ifndef NDEBUG
2709 VPDominatorTree VPDT(Plan);
2710#endif
2711 // Sink recipes with no users inside the vector loop region if all users are
2712 // in the same exit block of the region.
2713 // TODO: Extend to sink recipes from inner loops.
2715 vp_post_order_shallow(LoopRegion->getEntry()))) {
2716 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2718 continue;
2719
2720 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2721 assert(!RepR->isPredicated() &&
2722 "Expected prior transformation of predicated replicates to "
2723 "replicate regions");
2724 // narrowToSingleScalarRecipes should have already maximally narrowed
2725 // replicates to single-scalar replicates.
2726 // TODO: When unrolling, replicateByVF doesn't handle sunk
2727 // non-single-scalar replicates correctly.
2728 if (!RepR->isSingleScalar())
2729 continue;
2730 }
2731
2732 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2733 // support recipes with multiple defined values (e.g., interleaved loads).
2734 auto *Def = cast<VPSingleDefRecipe>(&R);
2735 // Skip recipes without users as we cannot determine a sink block.
2736 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2737 // their execution frequency.
2738 if (Def->getNumUsers() == 0)
2739 continue;
2740
2741 VPBasicBlock *SinkBB = nullptr;
2742 // Cannot sink the recipe if any user
2743 // * is defined in any loop region, or
2744 // * is a phi, or
2745 // * multiple users in different blocks.
2746 if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2747 auto *UserR = cast<VPRecipeBase>(U);
2748 VPBasicBlock *Parent = UserR->getParent();
2749 // TODO: If the user is a PHI node, we should check the block of
2750 // incoming value. Support PHI node users if needed.
2751 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2752 return true;
2753 // TODO: Support sinking when users are in multiple blocks.
2754 if (SinkBB && SinkBB != Parent)
2755 return true;
2756 SinkBB = Parent;
2757 return false;
2758 }))
2759 continue;
2760
2761 // Only sink to dedicated exit blocks of the loop region.
2762 if (SinkBB->getSinglePredecessor() != LoopRegion)
2763 continue;
2764
2765 // TODO: This will need to be a check instead of a assert after
2766 // conditional branches in vectorized loops are supported.
2767 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2768 "Defining block must dominate sink block");
2769 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2770 // just moving.
2771 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2772 }
2773 }
2774}
2775
// Truncate recipes to the minimal bitwidths computed in \p MinBWs, widening
// results back to their original type where callers require it.
// NOTE(review): doxygen-scrape listing; the function-name line and several
// statements (orig. lines 2776, 2784, 2787-2788, 2790-2791, 2804, 2836, 2845)
// are elided here — verify against upstream VPlanTransforms.cpp.
2777 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2778 if (Plan.hasScalarVFOnly())
2779 return;
2780 // Keep track of created truncates, so they can be re-used. Note that we
2781 // cannot use RAUW after creating a new truncate, as this would could make
2782 // other uses have different types for their operands, making them invalidly
2783 // typed.
2785 VPTypeAnalysis TypeInfo(Plan);
2786 VPBasicBlock *PH = Plan.getVectorPreheader();
2789 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2792 continue;
2793
2794 VPValue *ResultVPV = R.getVPSingleValue();
2795 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
// MinBWs.lookup returns 0 for instructions with no minimal-bitwidth entry.
2796 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2797 if (!NewResSizeInBits)
2798 continue;
2799
2800 // If the value wasn't vectorized, we must maintain the original scalar
2801 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2802 // skip casts which do not need to be handled explicitly here, as
2803 // redundant casts will be removed during recipe simplification.
2805 continue;
2806
2807 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2808 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2809 assert(OldResTy->isIntegerTy() && "only integer types supported");
2810 (void)OldResSizeInBits;
2811
2812 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2813
2814 // Any wrapping introduced by shrinking this operation shouldn't be
2815 // considered undefined behavior. So, we can't unconditionally copy
2816 // arithmetic wrapping flags to VPW.
2817 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2818 VPW->dropPoisonGeneratingFlags();
2819
2820 if (OldResSizeInBits != NewResSizeInBits &&
2821 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2822 // Extend result to original width.
2823 auto *Ext = new VPWidenCastRecipe(
2824 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2825 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2826 Ext->insertAfter(&R);
// RAUW first, then re-point the extend's operand at the narrowed result so
// the extend itself does not end up using itself.
2827 ResultVPV->replaceAllUsesWith(Ext);
2828 Ext->setOperand(0, ResultVPV);
2829 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2830 } else {
2831 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2832 "Only ICmps should not need extending the result.");
2833 }
2834
2835 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2837 continue;
2838
2839 // Shrink operands by introducing truncates as needed.
// For selects, operand 0 is the condition and keeps its type; start at 1.
2840 unsigned StartIdx =
2841 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2842 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2843 auto *Op = R.getOperand(Idx);
2844 unsigned OpSizeInBits =
2846 if (OpSizeInBits == NewResSizeInBits)
2847 continue;
2848 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2849 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2850 if (!IterIsEmpty) {
// Reuse a previously created truncate for this operand.
2851 R.setOperand(Idx, ProcessedIter->second);
2852 continue;
2853 }
2854
// Live-in IR values get their truncate in the preheader; recipe-defined
// values are truncated right before the use.
2855 VPBuilder Builder;
2856 if (isa<VPIRValue>(Op))
2857 Builder.setInsertPoint(PH);
2858 else
2859 Builder.setInsertPoint(&R);
2860 VPWidenCastRecipe *NewOp =
2861 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2862 ProcessedIter->second = NewOp;
2863 R.setOperand(Idx, NewOp);
2864 }
2865
2866 }
2867 }
2868}
2869
/// Remove BranchOnCond terminators with a constant condition, disconnecting
/// the never-taken successor (latch-only when \p OnlyLatches is set).
// NOTE(review): doxygen-scrape listing; the block-iteration loop header
// (orig. lines 2875-2876) is elided here — verify against upstream.
2870void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
// The dominator tree is only needed to identify latches; build it lazily.
2871 std::optional<VPDominatorTree> VPDT;
2872 if (OnlyLatches)
2873 VPDT.emplace(Plan);
2874
2877 VPValue *Cond;
2878 // Skip blocks that are not terminated by BranchOnCond.
2879 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2880 continue;
2881
2882 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2883 continue;
2884
2885 assert(VPBB->getNumSuccessors() == 2 &&
2886 "Two successors expected for BranchOnCond");
// A true condition keeps successor 0, so successor 1 is dead (and vice
// versa); non-constant conditions are left alone.
2887 unsigned RemovedIdx;
2888 if (match(Cond, m_True()))
2889 RemovedIdx = 1;
2890 else if (match(Cond, m_False()))
2891 RemovedIdx = 0;
2892 else
2893 continue;
2894
2895 VPBasicBlock *RemovedSucc =
2896 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2897 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2898 "There must be a single edge between VPBB and its successor");
2899 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2900 // these recipes.
2901 for (VPRecipeBase &R : RemovedSucc->phis())
2902 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2903
2904 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2905 // automatically on VPlan destruction if it becomes unreachable.
2906 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2907 VPBB->back().eraseFromParent();
2908 }
2909}
2910
2932
2933// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2934// the loop terminator with a branch-on-cond recipe with the negated
2935// active-lane-mask as operand. Note that this turns the loop into an
2936// uncountable one. Only the existing terminator is replaced, all other existing
2937// recipes/users remain unchanged, except for poison-generating flags being
2938// dropped from the canonical IV increment. Return the created
2939// VPActiveLaneMaskPHIRecipe.
2940//
2941// The function adds the following recipes:
2942//
2943// vector.ph:
2944// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2945// %EntryALM = active-lane-mask %EntryInc, TC
2946//
2947// vector.body:
2948// ...
2949// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2950// ...
2951// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2952// %ALM = active-lane-mask %InLoopInc, TC
2953// %Negated = Not %ALM
2954// branch-on-cond %Negated
2955//
// NOTE(review): doxygen-scrape listing; the function signature (orig. lines
// 2956-2957) and a few statement fragments (2992, 3000) are elided here —
// verify against upstream VPlanTransforms.cpp.
2958 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2959 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2960 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2961 VPValue *StartV = CanonicalIVPHI->getStartValue();
2962
2963 auto *CanonicalIVIncrement =
2964 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2965 // TODO: Check if dropping the flags is needed.
2966 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2967 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2968 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2969 // we have to take unrolling into account. Each part needs to start at
2970 // Part * VF
2971 auto *VecPreheader = Plan.getVectorPreheader();
2972 VPBuilder Builder(VecPreheader);
2973
2974 // Create the ActiveLaneMask instruction using the correct start values.
2975 VPValue *TC = Plan.getTripCount();
2976 VPValue *VF = &Plan.getVF();
2977
2978 auto *EntryIncrement = Builder.createOverflowingOp(
2979 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
2980 DL, "index.part.next");
2981
2982 // Create the active lane mask instruction in the VPlan preheader.
2983 VPValue *ALMMultiplier =
2984 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2985 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2986 {EntryIncrement, TC, ALMMultiplier}, DL,
2987 "active.lane.mask.entry");
2988
2989 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2990 // preheader ActiveLaneMask instruction.
2991 auto *LaneMaskPhi =
2993 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2994
2995 // Create the active lane mask for the next iteration of the loop before the
2996 // original terminator.
2997 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2998 Builder.setInsertPoint(OriginalTerminator);
2999 auto *InLoopIncrement = Builder.createOverflowingOp(
3001 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3002 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3003 {InLoopIncrement, TC, ALMMultiplier}, DL,
3004 "active.lane.mask.next");
// Complete the phi: operand 0 is the preheader mask, this adds the backedge.
3005 LaneMaskPhi->addOperand(ALM);
3006
3007 // Replace the original terminator with BranchOnCond. We have to invert the
3008 // mask here because a true condition means jumping to the exit block.
3009 auto *NotMask = Builder.createNot(ALM, DL);
3010 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3011 OriginalTerminator->eraseFromParent();
3012 return LaneMaskPhi;
3013}
3014
// Replace the plan's header mask with an active-lane-mask, optionally also
// rewriting the loop's exit branch to be controlled by the mask.
// NOTE(review): doxygen-scrape listing; the first signature line (orig. 3015)
// and the find_if range argument (orig. 3019) are elided here — verify
// against upstream VPlanTransforms.cpp.
3016 bool UseActiveLaneMaskForControlFlow) {
3017 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3018 auto *FoundWidenCanonicalIVUser = find_if(
3020 assert(FoundWidenCanonicalIVUser &&
3021 "Must have widened canonical IV when tail folding!");
3022 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3023 auto *WideCanonicalIV =
3024 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
3025 VPSingleDefRecipe *LaneMask;
3026 if (UseActiveLaneMaskForControlFlow) {
// Control-flow variant: also installs the lane-mask phi and rewrites the
// exit branch (see addVPLaneMaskPhiAndUpdateExitBranch).
3027 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3028 } else {
3029 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3030 VPValue *ALMMultiplier =
3031 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3032 LaneMask =
3033 B.createNaryOp(VPInstruction::ActiveLaneMask,
3034 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3035 nullptr, "active.lane.mask");
3036 }
3037
3038 // Walk users of WideCanonicalIV and replace the header mask of the form
3039 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3040 // removing the old one to ensure there is always only a single header mask.
3041 HeaderMask->replaceAllUsesWith(LaneMask);
3042 HeaderMask->eraseFromParent();
3043}
3044
// Pattern-matcher that strips a known mask \p In from a mask expression:
// matching In itself sets Out to nullptr; matching (logical-and In, X) binds
// Out to the remaining mask X.
// NOTE(review): doxygen-scrape listing; the Out member declaration (orig.
// line 3047, presumably `Op1_t &Out;` — TODO confirm upstream) is elided.
3045template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
3046 Op0_t In;
3048
3049 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3050
3051 template <typename OpTy> bool match(OpTy *V) const {
// Exact match of the mask alone: nothing remains after removal.
3052 if (m_Specific(In).match(V)) {
3053 Out = nullptr;
3054 return true;
3055 }
// Otherwise require (logical-and In, Out) and capture the remainder.
3056 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3057 }
3058};
3059
3060/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
3061/// Returns the remaining part \p Out if so, or nullptr otherwise.
3062template <typename Op0_t, typename Op1_t>
3063static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3064 Op1_t &Out) {
3065 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3066}
3067
3068/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3069/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3070/// recipe could be created.
3071/// \p HeaderMask Header Mask.
3072/// \p CurRecipe Recipe to be transform.
3073/// \p TypeInfo VPlan-based type analysis.
3074/// \p EVL The explicit vector length parameter of vector-predication
3075/// intrinsics.
// NOTE(review): doxygen-scrape listing; the function-name line (orig. 3076)
// and the BuildVectorLane/ZExt builder line (orig. 3159) are elided here —
// verify against upstream VPlanTransforms.cpp.
3077 VPRecipeBase &CurRecipe,
3078 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3079 VPlan *Plan = CurRecipe.getParent()->getPlan();
3080 DebugLoc DL = CurRecipe.getDebugLoc();
3081 VPValue *Addr, *Mask, *EndPtr;
3082
3083 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
3084 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3085 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3086 EVLEndPtr->insertBefore(&CurRecipe);
3087 EVLEndPtr->setOperand(1, &EVL);
3088 return EVLEndPtr;
3089 };
3090
// Case 1: non-reversed masked load -> vp load with EVL.
3091 if (match(&CurRecipe,
3092 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
3093 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
3094 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3095 EVL, Mask);
3096
// Case 2: reverse(masked load via end-pointer) -> vp load + vp.reverse,
// with the end pointer re-based on EVL instead of VF.
3097 VPValue *ReversedVal;
3098 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3099 match(ReversedVal,
3100 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
3101 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3102 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
3103 auto *LoadR = new VPWidenLoadEVLRecipe(
3104 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
3105 LoadR->insertBefore(&CurRecipe);
3106 return new VPWidenIntrinsicRecipe(
3107 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3108 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3109 }
3110
// Case 3: non-reversed masked store -> vp store with EVL.
3111 VPValue *StoredVal;
3112 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3113 m_RemoveMask(HeaderMask, Mask))) &&
3114 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
3115 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3116 StoredVal, EVL, Mask);
3117
// Case 4: reversed masked store -> vp.reverse of the value + vp store via the
// EVL-adjusted end pointer.
3118 if (match(&CurRecipe,
3119 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3120 m_RemoveMask(HeaderMask, Mask))) &&
3121 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
3122 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
3123 auto *NewReverse = new VPWidenIntrinsicRecipe(
3124 Intrinsic::experimental_vp_reverse,
3125 {ReversedVal, Plan->getTrue(), &EVL},
3126 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
3127 NewReverse->insertBefore(&CurRecipe);
3128 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
3129 AdjustEndPtr(EndPtr), NewReverse, EVL,
3130 Mask);
3131 }
3132
// Case 5: conditional reduction -> EVL reduction.
3133 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3134 if (Rdx->isConditional() &&
3135 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3136 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3137
// Case 6: masked interleave group -> EVL interleave.
3138 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3139 if (Interleave->getMask() &&
3140 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3141 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3142
// Cases 7/8: selects keyed on the header mask (alone or combined) -> vp.merge.
3143 VPValue *LHS, *RHS;
3144 if (match(&CurRecipe,
3145 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3146 return new VPWidenIntrinsicRecipe(
3147 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3148 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3149
3150 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3151 m_VPValue(RHS))))
3152 return new VPWidenIntrinsicRecipe(
3153 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3154 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3155
// Case 9: last-active-lane of the header mask is simply EVL - 1.
3156 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3157 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3158 VPValue *ZExt = VPBuilder(&CurRecipe)
3160 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3161 return new VPInstruction(
3162 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3163 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3164 }
3165
3166 return nullptr;
3167}
3168
3169/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3170/// The transforms here need to preserve the original semantics.
// NOTE(review): doxygen-scrape listing; the function signature (orig. 3171)
// and the recipe-iteration/match lines (orig. 3174-3175, 3188, 3218) are
// elided here — verify against upstream VPlanTransforms.cpp.
3172 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3173 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3176 m_VPValue(EVL))) &&
3177 match(EVL, m_EVL(m_VPValue()))) {
3178 HeaderMask = R.getVPSingleValue();
3179 break;
3180 }
3181 }
3182 if (!HeaderMask)
3183 return;
3184
3185 VPTypeAnalysis TypeInfo(Plan);
3186 SmallVector<VPRecipeBase *> OldRecipes;
// First pass: convert maskable users of the header mask into EVL recipes.
3187 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3189 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3190 NewR->insertBefore(R);
3191 for (auto [Old, New] :
3192 zip_equal(R->definedValues(), NewR->definedValues()))
3193 Old->replaceAllUsesWith(New);
3194 OldRecipes.push_back(R);
3195 }
3196 }
3197
3198 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3199 // False, EVL)
3200 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3201 VPValue *Mask;
3202 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3203 auto *LogicalAnd = cast<VPInstruction>(U);
3204 auto *Merge = new VPWidenIntrinsicRecipe(
3205 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3206 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3207 Merge->insertBefore(LogicalAnd);
3208 LogicalAnd->replaceAllUsesWith(Merge);
3209 OldRecipes.push_back(LogicalAnd);
3210 }
3211 }
3212
3213 // Erase old recipes at the end so we don't invalidate TypeInfo.
3214 for (VPRecipeBase *R : reverse(OldRecipes)) {
3215 SmallVector<VPValue *> PossiblyDead(R->operands());
3216 R->eraseFromParent();
3217 for (VPValue *Op : PossiblyDead)
3219 }
3220}
3221
3222/// After replacing the canonical IV with a EVL-based IV, fixup recipes that use
3223/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3224/// iteration.
// NOTE(review): doxygen-scrape listing; the assert/replace predicates and the
// FOR-phi match lines (orig. 3231-3232, 3235, 3250, 3256, 3270-3271, 3275,
// 3279-3280, 3304) are elided here — verify against upstream.
3225static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3226 VPTypeAnalysis TypeInfo(Plan);
3227 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3228 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3229
3230 assert(all_of(Plan.getVF().users(),
3233 "User of VF that we can't transform to EVL.");
3234 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3236 });
3237
3238 assert(all_of(Plan.getVFxUF().users(),
3239 [&LoopRegion, &Plan](VPUser *U) {
3240 return match(U,
3241 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3242 m_Specific(&Plan.getVFxUF()))) ||
3243 isa<VPWidenPointerInductionRecipe>(U);
3244 }) &&
3245 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3246 "increment of the canonical induction.");
3247 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3248 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3249 // canonical induction must not be updated.
3251 });
3252
3253 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3254 // contained.
3255 bool ContainsFORs =
3257 if (ContainsFORs) {
3258 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3259 VPValue *MaxEVL = &Plan.getVF();
3260 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
3261 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3262 MaxEVL = Builder.createScalarZExtOrTrunc(
3263 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3264 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3265
// prev.evl starts at the max EVL and carries last iteration's EVL around
// the backedge.
3266 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3267 VPValue *PrevEVL = Builder.createScalarPhi(
3268 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3269
3272 for (VPRecipeBase &R : *VPBB) {
3273 VPValue *V1, *V2;
3274 if (!match(&R,
3276 m_VPValue(V1), m_VPValue(V2))))
3277 continue;
// Replace the first-order-recurrence splice with @llvm.experimental.vp.splice
// driven by (prev.evl, evl).
3278 VPValue *Imm = Plan.getOrAddLiveIn(
3281 Intrinsic::experimental_vp_splice,
3282 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3283 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3284 R.getDebugLoc());
3285 VPSplice->insertBefore(&R);
3286 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3287 }
3288 }
3289 }
3290
3291 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3292 if (!HeaderMask)
3293 return;
3294
3295 // Replace header masks with a mask equivalent to predicating by EVL:
3296 //
3297 // icmp ule widen-canonical-iv backedge-taken-count
3298 // ->
3299 // icmp ult step-vector, EVL
3300 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3301 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3302 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3303 VPValue *EVLMask = Builder.createICmp(
3305 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3306 HeaderMask->replaceAllUsesWith(EVLMask);
3307}
3308
3309/// Converts a tail folded vector loop region to step by
3310/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3311/// iteration.
3312///
3313/// - Add a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3314/// replaces all uses except the canonical IV increment of
3315/// VPCanonicalIVPHIRecipe with a VPCurrentIterationPHIRecipe.
3316/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3317/// this transformation.
3318///
3319/// - The header mask is replaced with a header mask based on the EVL.
3320///
3321/// - Plans with FORs have a new phi added to keep track of the EVL of the
3322/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3323/// @llvm.vp.splice.
3324///
3325/// The function uses the following definitions:
3326/// %StartV is the canonical induction start value.
3327///
3328/// The function adds the following recipes:
3329///
3330/// vector.ph:
3331/// ...
3332///
3333/// vector.body:
3334/// ...
3335/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3336/// [ %NextIter, %vector.body ]
3337/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3338/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3339/// ...
3340/// %OpEVL = cast i32 %VPEVL to IVSize
3341/// %NextIter = add IVSize %OpEVL, %CurrentIter
3342/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3343/// ...
3344///
3345/// If MaxSafeElements is provided, the function adds the following recipes:
3346/// vector.ph:
3347/// ...
3348///
3349/// vector.body:
3350/// ...
3351/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3352/// [ %NextIter, %vector.body ]
3353/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3354/// %cmp = cmp ult %AVL, MaxSafeElements
3355/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3356/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3357/// ...
3358/// %OpEVL = cast i32 %VPEVL to IVSize
3359/// %NextIter = add IVSize %OpEVL, %CurrentIter
3360/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3361/// ...
3362///
// NOTE(review): doxygen-scrape listing; the function-name line (orig. 3363)
// and the CurrentIteration construction line (orig. 3376) are elided here —
// verify against upstream VPlanTransforms.cpp.
3364 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3365 if (Plan.hasScalarVFOnly())
3366 return;
3367 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3368 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3369
3370 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3371 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3372 VPValue *StartV = CanonicalIVPHI->getStartValue();
3373
3374 // Create the CurrentIteration recipe in the vector loop.
3375 auto *CurrentIteration =
3377 CurrentIteration->insertAfter(CanonicalIVPHI);
3378 VPBuilder Builder(Header, Header->getFirstNonPhi());
3379 // Create the AVL (application vector length), starting from TC -> 0 in steps
3380 // of EVL.
3381 VPPhi *AVLPhi = Builder.createScalarPhi(
3382 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3383 VPValue *AVL = AVLPhi;
3384
3385 if (MaxSafeElements) {
3386 // Support for MaxSafeDist for correct loop emission.
// Clamp the per-iteration AVL to MaxSafeElements: min(AVL, MaxSafeElements).
3387 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3388 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3389 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3390 "safe_avl");
3391 }
3392 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3393 DebugLoc::getUnknown(), "evl");
3394
3395 auto *CanonicalIVIncrement =
3396 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3397 Builder.setInsertPoint(CanonicalIVIncrement);
3398 VPValue *OpVPEVL = VPEVL;
3399
// The EVL is i32; widen/narrow it to the canonical IV type before using it
// to step the IV and AVL.
3400 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3401 OpVPEVL = Builder.createScalarZExtOrTrunc(
3402 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3403
3404 auto *NextIter = Builder.createAdd(
3405 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3406 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3407 CurrentIteration->addOperand(NextIter);
3408
3409 VPValue *NextAVL =
3410 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3411 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3412 AVLPhi->addOperand(NextAVL);
3413
3414 fixupVFUsersForEVL(Plan, *VPEVL);
3415 removeDeadRecipes(Plan);
3416
3417 // Replace all uses of VPCanonicalIVPHIRecipe by
3418 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3419 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3420 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3421 // TODO: support unroll factor > 1.
3422 Plan.setUF(1);
3423}
3424
// Lower the abstract VPCurrentIterationPHIRecipe of an EVL-stepped loop to a
// concrete scalar phi, and retire the now-redundant canonical IV.
// NOTE(review): doxygen-scrape listing; the function signature (orig. 3425),
// the block-iteration header (orig. 3430-3431) and the builder call line
// (orig. 3449) are elided here — verify against upstream.
3426 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3427 // There should be only one VPCurrentIteration in the entire plan.
3428 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3429
3432 for (VPRecipeBase &R : VPBB->phis())
3433 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3434 assert(!CurrentIteration &&
3435 "Found multiple CurrentIteration. Only one expected");
3436 CurrentIteration = PhiR;
3437 }
3438
3439 // Early return if it is not variable-length stepping.
3440 if (!CurrentIteration)
3441 return;
3442
3443 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3444 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3445
3446 // Convert CurrentIteration to concrete recipe.
3447 auto *ScalarR =
3448 VPBuilder(CurrentIteration)
3450 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3451 CurrentIteration->getDebugLoc(), "current.iteration.iv")
3452 CurrentIteration->replaceAllUsesWith(ScalarR);
3453 CurrentIteration->eraseFromParent();
3454
3455 // Replace CanonicalIVInc with CurrentIteration increment.
3456 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3457 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3458 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3459 m_Specific(&Plan.getVFxUF()))) &&
3460 "Unexpected canonical iv");
3461 Backedge->replaceAllUsesWith(CurrentIterationIncr);
3462
3463 // Remove unused phi and increment.
3464 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3465 CanonicalIVIncrement->eraseFromParent();
3466 CanonicalIV->eraseFromParent();
3467}
3468
// Rewrite the latch exit condition of an EVL tail-folded loop to compare the
// next AVL against zero, removing the dependence on the canonical IV / vector
// trip count.
// NOTE(review): doxygen-scrape listing; the function signature (orig. 3469)
// and several match/dyn_cast lines (orig. 3473, 3479, 3493, 3500, 3511-3512)
// are elided here — verify against upstream VPlanTransforms.cpp.
3470 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3471 // The canonical IV may not exist at this stage.
3472 if (!LoopRegion ||
3474 return;
3475 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3476 if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3477 return;
3478 // The EVL IV is always immediately after the canonical IV.
3480 std::next(CanIV->getIterator()));
3481 if (!EVLPhi)
3482 return;
3483
3484 // Bail if not an EVL tail folded loop.
3485 VPValue *AVL;
3486 if (!match(EVLPhi->getBackedgeValue(),
3487 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3488 return;
3489
3490 // The AVL may be capped to a safe distance.
3491 VPValue *SafeAVL, *UnsafeAVL;
3492 if (match(AVL,
3494 m_VPValue(SafeAVL)),
3495 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3496 AVL = UnsafeAVL;
3497
3498 VPValue *AVLNext;
3499 [[maybe_unused]] bool FoundAVLNext =
3501 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3502 assert(FoundAVLNext && "Didn't find AVL backedge?");
3503
// A branch-on-true latch means the exit was already folded away; nothing to do.
3504 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3505 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3506 if (match(LatchBr, m_BranchOnCond(m_True())))
3507 return;
3508
3509 assert(
3510 match(LatchBr,
3513 m_Specific(&Plan.getVectorTripCount())))) &&
3514 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3515 "trip count");
3516
// Exit when the remaining AVL reaches zero.
3517 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3518 VPBuilder Builder(LatchBr);
3519 LatchBr->setOperand(
3520 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3521}
3522
// Replace symbolic stride VPValues with the constant strides guaranteed by
// SCEV predicates, and re-expand entry-block SCEVs under that rewrite.
// NOTE(review): doxygen-scrape listing; the function-name line (orig. 3523)
// and the sext/zext user filter (orig. 3549) are elided here — verify
// against upstream VPlanTransforms.cpp.
3524 VPlan &Plan, PredicatedScalarEvolution &PSE,
3525 const DenseMap<Value *, const SCEV *> &StridesMap) {
3526 // Replace VPValues for known constant strides guaranteed by predicate scalar
3527 // evolution.
// Versioned strides are only valid inside regions or the block feeding the
// vector loop region.
3528 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3529 auto *R = cast<VPRecipeBase>(&U);
3530 return R->getRegion() ||
3531 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3532 };
3533 ValueToSCEVMapTy RewriteMap;
3534 for (const SCEV *Stride : StridesMap.values()) {
3535 using namespace SCEVPatternMatch;
3536 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3537 const APInt *StrideConst;
3538 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3539 // Only handle constant strides for now.
3540 continue;
3541
3542 auto *CI = Plan.getConstantInt(*StrideConst);
3543 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3544 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3545
3546 // The versioned value may not be used in the loop directly but through a
3547 // sext/zext. Add new live-ins in those cases.
3548 for (Value *U : StrideV->users()) {
3550 continue;
3551 VPValue *StrideVPV = Plan.getLiveIn(U);
3552 if (!StrideVPV)
3553 continue;
// Extend the constant the same way the IR extends the stride value.
3554 unsigned BW = U->getType()->getScalarSizeInBits();
3555 APInt C =
3556 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3557 VPValue *CI = Plan.getConstantInt(C);
3558 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3559 }
3560 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3561 }
3562
// Re-expand any entry-block SCEV expansions under the stride rewrite map,
// keeping the plan's trip count pointer up to date.
3563 for (VPRecipeBase &R : *Plan.getEntry()) {
3564 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3565 if (!ExpSCEV)
3566 continue;
3567 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3568 auto *NewSCEV =
3569 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3570 if (NewSCEV != ScevExpr) {
3571 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3572 ExpSCEV->replaceAllUsesWith(NewExp);
3573 if (Plan.getTripCount() == ExpSCEV)
3574 Plan.resetTripCount(NewExp);
3575 }
3576 }
3577}
3578
// Drops poison-generating flags from recipes that contribute to the address
// computation of consecutive widened memory accesses (and interleave groups)
// whose underlying IR block needs predication, so speculatively executed lanes
// cannot introduce UB via poison. NOTE(review): the declaration line naming
// this function (original line 3579) is missing from this extract; behavior is
// described from the visible body only.
3580 VPlan &Plan,
3581 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3582 // Collect recipes in the backward slice of `Root` that may generate a poison
3583 // value that is used after vectorization.
// NOTE(review): the worklist/visited-set declarations (original lines
// 3584/3586) are missing from this extract; both are used below.
3585 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3587 Worklist.push_back(Root);
3588
3589 // Traverse the backward slice of Root through its use-def chain.
3590 while (!Worklist.empty()) {
3591 VPRecipeBase *CurRec = Worklist.pop_back_val();
3592
3593 if (!Visited.insert(CurRec).second)
3594 continue;
3595
3596 // Prune search if we find another recipe generating a widen memory
3597 // instruction. Widen memory instructions involved in address computation
3598 // will lead to gather/scatter instructions, which don't need to be
3599 // handled.
// NOTE(review): the head of this isa<> check (original line 3600) is missing
// from this extract; only the trailing type in the list and the continue
// survive here.
3601 VPHeaderPHIRecipe>(CurRec))
3602 continue;
3603
3604 // This recipe contributes to the address computation of a widen
3605 // load/store. If the underlying instruction has poison-generating flags,
3606 // drop them directly.
3607 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3608 VPValue *A, *B;
3609 // Dropping disjoint from an OR may yield incorrect results, as some
3610 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3611 // for dependence analysis). Instead, replace it with an equivalent Add.
3612 // This is possible as all users of the disjoint OR only access lanes
3613 // where the operands are disjoint or poison otherwise.
3614 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3615 RecWithFlags->isDisjoint()) {
3616 VPBuilder Builder(RecWithFlags);
3617 VPInstruction *New =
3618 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3619 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3620 RecWithFlags->replaceAllUsesWith(New);
3621 RecWithFlags->eraseFromParent();
// Continue the backward traversal from the replacement add.
3622 CurRec = New;
3623 } else
3624 RecWithFlags->dropPoisonGeneratingFlags();
3625 } else {
// NOTE(review): the statement obtaining the underlying IR instruction
// (original lines 3626-3627) is missing from this extract; `Instr` is only
// asserted on below.
3628 (void)Instr;
3629 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3630 "found instruction with poison generating flags not covered by "
3631 "VPRecipeWithIRFlags");
3632 }
3633
3634 // Add new definitions to the worklist.
3635 for (VPValue *Operand : CurRec->operands())
3636 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3637 Worklist.push_back(OpDef);
3638 }
3639 });
3640
3641 // Traverse all the recipes in the VPlan and collect the poison-generating
3642 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3643 // VPInterleaveRecipe.
3644 auto Iter = vp_depth_first_deep(Plan.getEntry());
// NOTE(review): the loop header iterating the blocks of `Iter` (original line
// 3645) is missing from this extract.
3646 for (VPRecipeBase &Recipe : *VPBB) {
3647 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3648 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3649 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
// Only consecutive accesses are processed here; per the comment above,
// non-consecutive (gather/scatter) addresses need no flag dropping.
3650 if (AddrDef && WidenRec->isConsecutive() &&
3651 BlockNeedsPredication(UnderlyingInstr.getParent()))
3652 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3653 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3654 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3655 if (AddrDef) {
3656 // Check if any member of the interleave group needs predication.
3657 const InterleaveGroup<Instruction> *InterGroup =
3658 InterleaveRec->getInterleaveGroup();
3659 bool NeedPredication = false;
3660 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3661 I < NumMembers; ++I) {
3662 Instruction *Member = InterGroup->getMember(I);
3663 if (Member)
3664 NeedPredication |= BlockNeedsPredication(Member->getParent());
3665 }
3666
3667 if (NeedPredication)
3668 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3669 }
3670 }
3671 }
3672 }
3673}
3674
// Replaces the widened memory recipes of each interleave group with a single
// VPInterleaveRecipe at the group's insertion point, collecting stored values,
// intersecting member metadata, computing a mask-for-gaps and adjusting (or
// reversing) the start address as needed. NOTE(review): the declaration lines
// naming this function and the InterleaveGroups parameter type (original lines
// 3675/3677) are missing from this extract.
3676 VPlan &Plan,
3678 &InterleaveGroups,
3679 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3680 if (InterleaveGroups.empty())
3681 return;
3682
3683 // Interleave memory: for each Interleave Group we marked earlier as relevant
3684 // for this VPlan, replace the Recipes widening its memory instructions with a
3685 // single VPInterleaveRecipe at its insertion point.
3686 VPDominatorTree VPDT(Plan);
3687 for (const auto *IG : InterleaveGroups) {
3688 auto *Start =
3689 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
// Seed the group's metadata from member zero; intersected with the other
// members below so only metadata common to all members survives.
3690 VPIRMetadata InterleaveMD(*Start);
3691 SmallVector<VPValue *, 4> StoredValues;
3692 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3693 StoredValues.push_back(StoreR->getStoredValue());
3694 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3695 Instruction *MemberI = IG->getMember(I);
3696 if (!MemberI)
3697 continue;
3698 VPWidenMemoryRecipe *MemoryR =
3699 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3700 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3701 StoredValues.push_back(StoreR->getStoredValue())
3702 InterleaveMD.intersect(*MemoryR);
3703 }
3704
// A gap mask is required when the group needs a scalar epilogue that is not
// allowed, or when a store group does not cover all factor slots.
3705 bool NeedsMaskForGaps =
3706 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3707 (!StoredValues.empty() && !IG->isFull());
3708
3709 Instruction *IRInsertPos = IG->getInsertPos();
3710 auto *InsertPos =
3711 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3712
// NOTE(review): the declaration initializing `NW` (original line 3713,
// presumably default GEP no-wrap flags) is missing from this extract.
3714 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3715 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3716 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3717
3718 // Get or create the start address for the interleave group.
3719 VPValue *Addr = Start->getAddr();
3720 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3721 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3722 // We cannot re-use the address of member zero because it does not
3723 // dominate the insert position. Instead, use the address of the insert
3724 // position and create a PtrAdd adjusting it to the address of member
3725 // zero.
3726 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3727 // InsertPos or sink loads above zero members to join it.
3728 assert(IG->getIndex(IRInsertPos) != 0 &&
3729 "index of insert position shouldn't be zero");
3730 auto &DL = IRInsertPos->getDataLayout();
3731 APInt Offset(32,
3732 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3733 IG->getIndex(IRInsertPos),
3734 /*IsSigned=*/true);
// Negative offset: step back from the insert position's address to member 0.
3735 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3736 VPBuilder B(InsertPos);
3737 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3738 }
3739 // If the group is reverse, adjust the index to refer to the last vector
3740 // lane instead of the first. We adjust the index from the first vector
3741 // lane, rather than directly getting the pointer for lane VF - 1, because
3742 // the pointer operand of the interleaved access is supposed to be uniform.
3743 if (IG->isReverse()) {
3744 auto *ReversePtr = new VPVectorEndPointerRecipe(
3745 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3746 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3747 ReversePtr->insertBefore(InsertPos);
3748 Addr = ReversePtr;
3749 }
3750 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3751 InsertPos->getMask(), NeedsMaskForGaps,
3752 InterleaveMD, InsertPos->getDebugLoc());
3753 VPIG->insertBefore(InsertPos);
3754
// Rewire users of each non-void member to the corresponding result value of
// the interleave recipe, then erase the per-member recipes.
3755 unsigned J = 0;
3756 for (unsigned i = 0; i < IG->getFactor(); ++i)
3757 if (Instruction *Member = IG->getMember(i)) {
3758 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3759 if (!Member->getType()->isVoidTy()) {
3760 VPValue *OriginalV = MemberR->getVPSingleValue();
3761 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3762 J++;
3763 }
3764 MemberR->eraseFromParent();
3765 }
3766 }
3767}
3768
3769/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3770/// value, phi and backedge value. In the following example:
3771///
3772/// vector.ph:
3773/// Successor(s): vector loop
3774///
3775/// <x1> vector loop: {
3776/// vector.body:
3777/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3778/// ...
3779/// EMIT branch-on-count ...
3780/// No successors
3781/// }
3782///
3783/// WIDEN-INDUCTION will get expanded to:
3784///
3785/// vector.ph:
3786/// ...
3787/// vp<%induction.start> = ...
3788/// vp<%induction.increment> = ...
3789///
3790/// Successor(s): vector loop
3791///
3792/// <x1> vector loop: {
3793/// vector.body:
3794/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3795/// ...
3796/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3797/// EMIT branch-on-count ...
3798/// No successors
3799/// }
3800 static void
// NOTE(review): the line with this function's name and first parameter
// (original line 3801, presumably expandVPWidenIntOrFpInduction taking the
// VPWidenIntOrFpInductionRecipe *WidenIVR) is missing from this extract.
3802 VPTypeAnalysis &TypeInfo) {
3803 VPlan *Plan = WidenIVR->getParent()->getPlan();
3804 VPValue *Start = WidenIVR->getStartValue();
3805 VPValue *Step = WidenIVR->getStepValue();
3806 VPValue *VF = WidenIVR->getVFValue();
3807 DebugLoc DL = WidenIVR->getDebugLoc();
3808
3809 // The value from the original loop to which we are mapping the new induction
3810 // variable.
3811 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3812
3813 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
// NOTE(review): the declarations of AddOp/MulOp (original lines 3814-3815)
// are missing from this extract; they are assigned below.
3816 VPIRFlags Flags = *WidenIVR;
// Integer inductions use Add/Mul; otherwise the descriptor's induction
// opcode is paired with FMul.
3817 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3818 AddOp = Instruction::Add;
3819 MulOp = Instruction::Mul;
3820 } else {
3821 AddOp = ID.getInductionOpcode();
3822 MulOp = Instruction::FMul;
3823 }
3824
3825 // If the phi is truncated, truncate the start and step values.
3826 VPBuilder Builder(Plan->getVectorPreheader());
3827 Type *StepTy = TypeInfo.inferScalarType(Step);
3828 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3829 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3830 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3831 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3832 StepTy = Ty;
3833 }
3834
3835 // Construct the initial value of the vector IV in the vector loop preheader.
3836 Type *IVIntTy =
// NOTE(review): the initializer of IVIntTy (original line 3837) is missing
// from this extract.
3838 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
// The step vector is integral; convert it for floating-point inductions.
3839 if (StepTy->isFloatingPointTy())
3840 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3841
3842 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3843 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3844
// Initial vector IV: splat(Start) (+) step-vector (*) splat(Step).
3845 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3846 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3847 DebugLoc::getUnknown(), "induction");
3848
3849 // Create the widened phi of the vector IV.
3850 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3851 WidenIVR->getDebugLoc(), "vec.ind");
3852 WidePHI->insertBefore(WidenIVR);
3853
3854 // Create the backedge value for the vector IV.
3855 VPValue *Inc;
3856 VPValue *Prev;
3857 // If unrolled, use the increment and prev value from the operands.
3858 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3859 Inc = SplatVF;
3860 Prev = WidenIVR->getLastUnrolledPartOperand();
3861 } else {
// Place new recipes right after VF's defining recipe (if any) so they are
// dominated by their operands.
3862 if (VPRecipeBase *R = VF->getDefiningRecipe())
3863 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3864 // Multiply the vectorization factor by the step using integer or
3865 // floating-point arithmetic as appropriate.
3866 if (StepTy->isFloatingPointTy())
3867 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3868 DL);
3869 else
3870 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3871 TypeInfo.inferScalarType(VF), DL);
3872
3873 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3874 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3875 Prev = WidePHI;
3876 }
3877
// NOTE(review): the declaration of ExitingBB (original line 3878, presumably
// the vector loop region's exiting basic block) is missing from this extract.
3879 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3880 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3881 WidenIVR->getDebugLoc(), "vec.ind.next");
3882
3883 WidePHI->addOperand(Next);
3884
3885 WidenIVR->replaceAllUsesWith(WidePHI);
3886}
3887
3888/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3889/// initial value, phi and backedge value. In the following example:
3890///
3891/// <x1> vector loop: {
3892/// vector.body:
3893/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3894/// ...
3895/// EMIT branch-on-count ...
3896/// }
3897///
3898/// WIDEN-POINTER-INDUCTION will get expanded to:
3899///
3900/// <x1> vector loop: {
3901/// vector.body:
3902/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3903/// EMIT %mul = mul %stepvector, %step
3904/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3905/// ...
3906/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3907/// EMIT branch-on-count ...
3908/// }
// NOTE(review): the line naming this function and its recipe parameter
// (original line 3909) is missing from this extract; the surrounding doc
// comment describes the expansion performed here.
3910 VPTypeAnalysis &TypeInfo) {
3911 VPlan *Plan = R->getParent()->getPlan();
3912 VPValue *Start = R->getStartValue();
3913 VPValue *Step = R->getStepValue();
3914 VPValue *VF = R->getVFValue();
3915
3916 assert(R->getInductionDescriptor().getKind() ==
// NOTE(review): the expected induction-kind enumerator (original line 3917,
// presumably InductionDescriptor::IK_PtrInduction) is missing here.
3918 "Not a pointer induction according to InductionDescriptor!");
3919 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3920 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3921 "Recipe should have been replaced");
3922
3923 VPBuilder Builder(R);
3924 DebugLoc DL = R->getDebugLoc();
3925
3926 // Build a scalar pointer phi.
3927 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3928
3929 // Create actual address geps that use the pointer phi as base and a
3930 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3931 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3932 Type *StepTy = TypeInfo.inferScalarType(Step);
3933 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3934 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3935 VPValue *PtrAdd =
3936 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3937 R->replaceAllUsesWith(PtrAdd);
3938
3939 // Create the backedge value for the scalar pointer phi.
// NOTE(review): the declaration of ExitingBB (original line 3940) is missing
// from this extract.
3941 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
// The backedge increment is Step * VF, applied once per vector iteration.
3942 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3943 DL);
3944 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3945
3946 VPValue *InductionGEP =
3947 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3948 ScalarPtrPhi->addOperand(InductionGEP);
3949}
3950
// Dissolves all non-replicator loop regions of the plan into plain CFG. The
// regions are collected first and dissolved afterwards, since dissolving
// while iterating the depth-first traversal would invalidate it.
// NOTE(review): the enclosing function's signature line (original 3951) is
// missing from this extract.
3952 // Replace loop regions with explicit CFG.
3953 SmallVector<VPRegionBlock *> LoopRegions;
// NOTE(review): the loop header iterating the plan's region blocks (original
// line 3954) is missing from this extract.
3955 vp_depth_first_deep(Plan.getEntry()))) {
3956 if (!R->isReplicator())
3957 LoopRegions.push_back(R);
3958 }
3959 for (VPRegionBlock *R : LoopRegions)
3960 R->dissolveToCFGLoop();
3961}
3962
// Expands every BranchOnTwoConds terminator into two single-condition
// branches connected through a new interim block. NOTE(review): the enclosing
// function's signature lines (original 3963-3964) are missing from this
// extract.
3965 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3966 // terminated with BranchOnTwoConds are reached via a shallow traversal.
// NOTE(review): the worklist declaration and the block-traversal loop header
// (original lines 3967-3968) are missing from this extract.
3969 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3970 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3971 }
3972
3973 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3974 // single-condition branches:
3975 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3976 // the first condition is true, and otherwise jumps to a new interim block.
3977 // 2. A branch that ends the interim block, jumps to the second successor if
3978 // the second condition is true, and otherwise jumps to the third
3979 // successor.
3980 for (VPInstruction *Br : WorkList) {
3981 assert(Br->getNumOperands() == 2 &&
3982 "BranchOnTwoConds must have exactly 2 conditions");
3983 DebugLoc DL = Br->getDebugLoc();
3984 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3985 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3986 assert(Successors.size() == 3 &&
3987 "BranchOnTwoConds must have exactly 3 successors");
3988
// Disconnect all successors first; they are reconnected below in the new
// two-level branch structure.
3989 for (VPBlockBase *Succ : Successors)
3990 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3991
3992 VPValue *Cond0 = Br->getOperand(0);
3993 VPValue *Cond1 = Br->getOperand(1);
3994 VPBlockBase *Succ0 = Successors[0];
3995 VPBlockBase *Succ1 = Successors[1];
3996 VPBlockBase *Succ2 = Successors[2];
3997 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3998 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3999
4000 VPBasicBlock *InterimBB =
4001 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4002
4003 VPBuilder(BrOnTwoCondsBB)
// NOTE(review): the call creating the first BranchOnCond on Cond0 (original
// line 4004) is missing from this extract.
4005 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4006 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4007
// NOTE(review): the call creating the second BranchOnCond on Cond1 inside
// InterimBB (original line 4008) is missing from this extract.
4009 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4010 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4011 Br->eraseFromParent();
4012 }
4013}
4014
// Lowers abstract recipes into concrete, executable ones: expands widened
// int/FP and pointer inductions, blends, expression recipes, LastActiveLane,
// masked conditions, BranchOnCount and WideIVStep, erasing the replaced
// recipes at the end. NOTE(review): the enclosing function's signature line
// (original 4015) is missing from this extract.
4016 VPTypeAnalysis TypeInfo(Plan);
// NOTE(review): the ToRemove container declaration and the block-traversal
// loop header (original lines 4017-4018) are missing from this extract.
4019 vp_depth_first_deep(Plan.getEntry()))) {
4020 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4021 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4022 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4023 ToRemove.push_back(WidenIVR);
4024 continue;
4025 }
4026
4027 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4028 // If the recipe only generates scalars, scalarize it instead of
4029 // expanding it.
4030 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4031 VPBuilder Builder(WidenIVR);
4032 VPValue *PtrAdd =
4033 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4034 WidenIVR->replaceAllUsesWith(PtrAdd);
4035 ToRemove.push_back(WidenIVR);
4036 continue;
4037 }
4038 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4039 ToRemove.push_back(WidenIVR);
4040 continue;
4041 }
4042
4043 // Expand VPBlendRecipe into VPInstruction::Select.
4044 VPBuilder Builder(&R);
4045 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
// Fold the incoming values into a chain of selects over their masks.
4046 VPValue *Select = Blend->getIncomingValue(0);
4047 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4048 Select = Builder.createSelect(Blend->getMask(I),
4049 Blend->getIncomingValue(I), Select,
4050 R.getDebugLoc(), "predphi", *Blend);
4051 Blend->replaceAllUsesWith(Select);
4052 ToRemove.push_back(Blend);
4053 }
4054
4055 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4056 if (!VEPR->getOffset()) {
4057 assert(Plan.getConcreteUF() == 1 &&
4058 "Expected unroller to have materialized offset for UF != 1");
4059 VEPR->materializeOffset();
4060 }
4061 }
4062
4063 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4064 Expr->decompose();
4065 ToRemove.push_back(Expr);
4066 }
4067
4068 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
4069 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4070 if (LastActiveL &&
4071 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4072 // Create Not(Mask) for all operands.
// NOTE(review): the NotMasks container declaration (original line 4073) is
// missing from this extract.
4074 for (VPValue *Op : LastActiveL->operands()) {
4075 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4076 NotMasks.push_back(NotMask);
4077 }
4078
4079 // Create FirstActiveLane on the inverted masks.
4080 VPValue *FirstInactiveLane = Builder.createNaryOp(
// NOTE(review): the opcode/operand line of this call (original line 4081,
// presumably VPInstruction::FirstActiveLane over NotMasks) is missing.
4082 LastActiveL->getDebugLoc(), "first.inactive.lane");
4083
4084 // Subtract 1 to get the last active lane.
4085 VPValue *One =
4086 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4087 VPValue *LastLane =
4088 Builder.createSub(FirstInactiveLane, One,
4089 LastActiveL->getDebugLoc(), "last.active.lane");
4090
4091 LastActiveL->replaceAllUsesWith(LastLane);
4092 ToRemove.push_back(LastActiveL);
4093 continue;
4094 }
4095
4096 // Lower MaskedCond with block mask to LogicalAnd.
// NOTE(review): the if-head matching the MaskedCond instruction (original
// line 4097) is missing from this extract.
4098 auto *VPI = cast<VPInstruction>(&R);
4099 assert(VPI->isMasked() &&
4100 "Unmasked MaskedCond should be simplified earlier");
4101 VPI->replaceAllUsesWith(Builder.createNaryOp(
4102 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4103 ToRemove.push_back(VPI);
4104 continue;
4105 }
4106
4107 // Lower BranchOnCount to ICmp + BranchOnCond.
4108 VPValue *IV, *TC;
4109 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4110 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4111 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4112 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4113 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4114 ToRemove.push_back(BranchOnCountInst);
4115 continue;
4116 }
4117
4118 VPValue *VectorStep;
4119 VPValue *ScalarStep;
// NOTE(review): the if-head matching WideIVStep (original line 4120) is
// missing from this extract; non-matching recipes are skipped below.
4121 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4122 continue;
4123
4124 // Expand WideIVStep.
4125 auto *VPI = cast<VPInstruction>(&R);
4126 Type *IVTy = TypeInfo.inferScalarType(VPI);
4127 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
// NOTE(review): the declaration choosing the cast opcode (original line
// 4128) is missing from this extract; `CastOp` is used below.
4129 ? Instruction::UIToFP
4130 : Instruction::Trunc;
4131 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4132 }
4133
4134 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4135 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4136 ScalarStep =
4137 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4138 }
4139
// Use FMul with the instruction's fast-math flags for FP IVs, otherwise an
// integer Mul with default flags.
4140 VPIRFlags Flags;
4141 unsigned MulOpc;
4142 if (IVTy->isFloatingPointTy()) {
4143 MulOpc = Instruction::FMul;
4144 Flags = VPI->getFastMathFlags();
4145 } else {
4146 MulOpc = Instruction::Mul;
4147 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4148 }
4149
4150 VPInstruction *Mul = Builder.createNaryOp(
4151 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4152 VectorStep = Mul;
4153 VPI->replaceAllUsesWith(VectorStep);
4154 ToRemove.push_back(VPI);
4155 }
4156 }
4157
4158 for (VPRecipeBase *R : ToRemove)
4159 R->eraseFromParent();
4160}
4161
// Rewires the vector loop's latch and exit edges to support uncountable early
// exits: collects each early-exit condition, ORs them into an AnyOf latch
// condition, and builds dispatch plus vector.early.exit blocks that extract
// exit values at the first active lane. NOTE(review): the enclosing
// function's signature line (original 4162, including the VPlan parameter) is
// missing from this extract.
4163 VPBasicBlock *HeaderVPBB,
4164 VPBasicBlock *LatchVPBB,
4165 VPBasicBlock *MiddleVPBB,
4166 UncountableExitStyle Style) {
// Per-exit bookkeeping: the exiting block, its IR exit block, and the
// (possibly negated) condition under which the exit is taken.
4167 struct EarlyExitInfo {
4168 VPBasicBlock *EarlyExitingVPBB;
4169 VPIRBasicBlock *EarlyExitVPBB;
4170 VPValue *CondToExit;
4171 };
4172
4173 VPDominatorTree VPDT(Plan);
4174 VPBuilder Builder(LatchVPBB->getTerminator());
// NOTE(review): the declaration of the `Exits` container (original line 4175)
// is missing from this extract.
4176 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4177 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4178 if (Pred == MiddleVPBB)
4179 continue;
4180 // Collect condition for this early exit.
4181 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4182 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4183 VPValue *CondOfEarlyExitingVPBB;
4184 [[maybe_unused]] bool Matched =
4185 match(EarlyExitingVPBB->getTerminator(),
4186 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4187 assert(Matched && "Terminator must be BranchOnCond");
4188
4189 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4190 // the correct block mask.
4191 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4192 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
// NOTE(review): the opcode line of this call (original line 4193) is missing
// from this extract. The branch condition is negated below when the exit
// block is the false successor.
4194 TrueSucc == ExitBlock
4195 ? CondOfEarlyExitingVPBB
4196 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4197 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4198 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4199 VPDT.properlyDominates(
4200 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4201 LatchVPBB)) &&
4202 "exit condition must dominate the latch");
4203 Exits.push_back({
4204 EarlyExitingVPBB,
4205 ExitBlock,
4206 CondToEarlyExit,
4207 });
4208 }
4209 }
4210
4211 assert(!Exits.empty() && "must have at least one early exit");
4212 // Sort exits by RPO order to get correct program order. RPO gives a
4213 // topological ordering of the CFG, ensuring upstream exits are checked
4214 // before downstream exits in the dispatch chain.
// NOTE(review): the declarations of the RPO traversal object and the RPO
// index map (original lines 4215/4217) are missing from this extract.
4216 HeaderVPBB);
4218 for (const auto &[Num, VPB] : enumerate(RPOT))
4219 RPOIdx[VPB] = Num;
4220 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4221 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4222 });
4223#ifndef NDEBUG
4224 // After RPO sorting, verify that for any pair where one exit dominates
4225 // another, the dominating exit comes first. This is guaranteed by RPO
4226 // (topological order) and is required for the dispatch chain correctness.
4227 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4228 for (unsigned J = I + 1; J < Exits.size(); ++J)
4229 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4230 Exits[I].EarlyExitingVPBB) &&
4231 "RPO sort must place dominating exits before dominated ones");
4232#endif
4233
4234 // Build the AnyOf condition for the latch terminator using logical OR
4235 // to avoid poison propagation from later exit conditions when an earlier
4236 // exit is taken.
4237 VPValue *Combined = Exits[0].CondToExit;
4238 for (const EarlyExitInfo &Info : drop_begin(Exits))
4239 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4240
4241 VPValue *IsAnyExitTaken =
4242 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4243
// NOTE(review): the assert head (original line 4244, presumably checking the
// UncountableExitStyle parameter) is missing from this extract.
4245 "Early exit store masking not implemented");
4246
4247 // Create the vector.early.exit blocks.
4248 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4249 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4250 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4251 VPBasicBlock *VectorEarlyExitVPBB =
4252 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4253 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4254 }
4255
4256 // Create the dispatch block (or reuse the single exit block if only one
4257 // exit). The dispatch block computes the first active lane of the combined
4258 // condition and, for multiple exits, chains through conditions to determine
4259 // which exit to take.
4260 VPBasicBlock *DispatchVPBB =
4261 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4262 : Plan.createVPBasicBlock("vector.early.exit.check");
4263 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4264 VPValue *FirstActiveLane =
4265 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4266 DebugLoc::getUnknown(), "first.active.lane");
4267
4268 // For each early exit, disconnect the original exiting block
4269 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4270 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4271 // values at the first active lane:
4272 //
4273 // Input:
4274 // early.exiting.I:
4275 // ...
4276 // EMIT branch-on-cond vp<%cond.I>
4277 // Successor(s): in.loop.succ, ir-bb<exit.I>
4278 //
4279 // ir-bb<exit.I>:
4280 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4281 //
4282 // Output:
4283 // early.exiting.I:
4284 // ...
4285 // Successor(s): in.loop.succ
4286 //
4287 // vector.early.exit.I:
4288 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4289 // Successor(s): ir-bb<exit.I>
4290 //
4291 // ir-bb<exit.I>:
4292 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4293 // vector.early.exit.I)
4294 //
4295 for (auto [Exit, VectorEarlyExitVPBB] :
4296 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4297 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4298 // Adjust the phi nodes in EarlyExitVPBB.
4299 // 1. remove incoming values from EarlyExitingVPBB,
4300 // 2. extract the incoming value at FirstActiveLane
4301 // 3. add back the extracts as last operands for the phis
4302 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4303 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4304 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4305 // values from VectorEarlyExitVPBB.
4306 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4307 auto *ExitIRI = cast<VPIRPhi>(&R);
4308 VPValue *IncomingVal =
4309 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4310 VPValue *NewIncoming = IncomingVal;
// IR live-in values are kept as-is; only plan-defined values need a
// lane extract.
4311 if (!isa<VPIRValue>(IncomingVal)) {
4312 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4313 NewIncoming = EarlyExitBuilder.createNaryOp(
4314 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4315 DebugLoc::getUnknown(), "early.exit.value");
4316 }
4317 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4318 ExitIRI->addOperand(NewIncoming);
4319 }
4320
4321 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4322 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4323 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4324 }
4325
4326 // Chain through exits: for each exit, check if its condition is true at
4327 // the first active lane. If so, take that exit; otherwise, try the next.
4328 // The last exit needs no check since it must be taken if all others fail.
4329 //
4330 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4331 //
4332 // latch:
4333 // ...
4334 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4335 // ...
4336 //
4337 // vector.early.exit.check:
4338 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4339 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4340 // EMIT branch-on-cond vp<%at.cond.0>
4341 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4342 //
4343 // vector.early.exit.check.0:
4344 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4345 // EMIT branch-on-cond vp<%at.cond.1>
4346 // Successor(s): vector.early.exit.1, vector.early.exit.2
4347 VPBasicBlock *CurrentBB = DispatchVPBB;
4348 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4349 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4350 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4351 DebugLoc::getUnknown(), "exit.cond.at.lane");
4352
4353 // For the last dispatch, branch directly to the last exit on false;
4354 // otherwise, create a new check block.
4355 bool IsLastDispatch = (I + 2 == Exits.size());
4356 VPBasicBlock *FalseBB =
4357 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4358 : Plan.createVPBasicBlock(
4359 Twine("vector.early.exit.check.") + Twine(I));
4360
4361 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4362 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4363 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4364 FalseBB->setPredecessors({CurrentBB});
4365
4366 CurrentBB = FalseBB;
4367 DispatchBuilder.setInsertPoint(CurrentBB);
4368 }
4369
4370 // Replace the latch terminator with the new branching logic.
4371 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4372 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4373 "Unexpected terminator");
4374 auto *IsLatchExitTaken =
4375 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4376 LatchExitingBranch->getOperand(1));
4377
4378 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4379 LatchExitingBranch->eraseFromParent();
4380 Builder.setInsertPoint(LatchVPBB);
// BranchOnTwoConds: first successor on any early exit, second on the latch
// (counted) exit, third continues the loop back to the header.
4381 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4382 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4383 LatchVPBB->clearSuccessors();
4384 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4385 DispatchVPBB->setPredecessors({LatchVPBB});
4386}
4387
4388 /// This function tries to convert extended in-loop reductions to
4389/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4390/// valid. The created recipe must be decomposed to its constituent
4391/// recipes before execution.
4392 static VPExpressionRecipe *
// NOTE(review): the line with this function's name and leading parameters
// (original line 4393, presumably taking the reduction recipe `Red` and the
// cost context `Ctx`) is missing from this extract.
4394 VFRange &Range) {
4395 Type *RedTy = Ctx.Types.inferScalarType(Red);
4396 VPValue *VecOp = Red->getVecOp();
4397
4398 // Clamp the range if using extended-reduction is profitable.
4399 auto IsExtendedRedValidAndClampRange =
4400 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
// NOTE(review): the call head wrapping this per-VF predicate (original line
// 4401, presumably a decision-and-clamp-range helper) is missing from this
// extract.
4402 [&](ElementCount VF) {
4403 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
// NOTE(review): declarations from original lines 4404 and 4406 (presumably
// the cost kind and an initially-invalid ExtRedCost) are missing here.
4405
4407 InstructionCost ExtCost =
4408 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4409 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4410
4411 if (Red->isPartialReduction()) {
// NOTE(review): the partial-reduction extension-kind setup (original lines
// 4412-4413) is missing from this extract; `ExtKind` is used below.
4414 // FIXME: Move partial reduction creation, costing and clamping
4415 // here from LoopVectorize.cpp.
4416 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4417 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4418 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4419 RedTy->isFloatingPointTy()
4420 ? std::optional{Red->getFastMathFlags()}
4421 : std::nullopt);
4422 } else if (!RedTy->isFloatingPointTy()) {
4423 // TTI::getExtendedReductionCost only supports integer types.
4424 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4425 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4426 Red->getFastMathFlags(), CostKind);
4427 }
// Fold the extend into the reduction only if that is strictly cheaper than
// keeping the extend and the reduction as separate operations.
4428 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4429 },
4430 Range);
4431 };
4432
4433 VPValue *A;
4434 // Match reduce(ext(A)).
4435 if (isa<VPWidenCastRecipe>(VecOp) &&
4436 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4437 match(VecOp, m_FPExt(m_VPValue(A)))) &&
4438 IsExtendedRedValidAndClampRange(
4439 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4440 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4441 Ctx.Types.inferScalarType(A)))
4442 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4443
4444 return nullptr;
4445}
4446
4447/// This function tries convert extended in-loop reductions to
4448/// VPExpressionRecipe and clamp the \p Range if it is beneficial
4449/// and valid. The created VPExpressionRecipe must be decomposed to its
4450/// constituent recipes before execution. Patterns of the
4451/// VPExpressionRecipe:
4452/// reduce.add(mul(...)),
4453/// reduce.add(mul(ext(A), ext(B))),
4454/// reduce.add(ext(mul(ext(A), ext(B)))).
4455/// reduce.fadd(fmul(ext(A), ext(B)))
4456static VPExpressionRecipe *
4458 VPCostContext &Ctx, VFRange &Range) {
4459 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4460 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4461 Opcode != Instruction::FAdd)
4462 return nullptr;
4463
4464 Type *RedTy = Ctx.Types.inferScalarType(Red);
4465
4466 // Clamp the range if using multiply-accumulate-reduction is profitable.
4467 auto IsMulAccValidAndClampRange =
4469 VPWidenCastRecipe *OuterExt) -> bool {
4471 [&](ElementCount VF) {
4473 Type *SrcTy =
4474 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4475 InstructionCost MulAccCost;
4476
4477 if (Red->isPartialReduction()) {
4478 Type *SrcTy2 =
4479 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4480 // FIXME: Move partial reduction creation, costing and clamping
4481 // here from LoopVectorize.cpp.
4482 MulAccCost = Ctx.TTI.getPartialReductionCost(
4483 Opcode, SrcTy, SrcTy2, RedTy, VF,
4485 Ext0->getOpcode())
4488 Ext1->getOpcode())
4490 Mul->getOpcode(), CostKind,
4491 RedTy->isFloatingPointTy()
4492 ? std::optional{Red->getFastMathFlags()}
4493 : std::nullopt);
4494 } else {
4495 // Only partial reductions support mixed or floating-point extends
4496 // at the moment.
4497 if (Ext0 && Ext1 &&
4498 (Ext0->getOpcode() != Ext1->getOpcode() ||
4499 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4500 return false;
4501
4502 bool IsZExt =
4503 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4504 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4505 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4506 SrcVecTy, CostKind);
4507 }
4508
4509 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4510 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4511 InstructionCost ExtCost = 0;
4512 if (Ext0)
4513 ExtCost += Ext0->computeCost(VF, Ctx);
4514 if (Ext1)
4515 ExtCost += Ext1->computeCost(VF, Ctx);
4516 if (OuterExt)
4517 ExtCost += OuterExt->computeCost(VF, Ctx);
4518
4519 return MulAccCost.isValid() &&
4520 MulAccCost < ExtCost + MulCost + RedCost;
4521 },
4522 Range);
4523 };
4524
4525 VPValue *VecOp = Red->getVecOp();
4526 VPRecipeBase *Sub = nullptr;
4527 VPValue *A, *B;
4528 VPValue *Tmp = nullptr;
4529
4530 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4531 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4532 assert(Opcode == Instruction::FAdd &&
4533 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4534 "instruction");
4535 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4536 if (!FMul)
4537 return nullptr;
4538
4539 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4540 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4541
4542 if (RecipeA && RecipeB &&
4543 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4544 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4545 }
4546 }
4547 if (RedTy->isFloatingPointTy())
4548 return nullptr;
4549
4550 // Sub reductions could have a sub between the add reduction and vec op.
4551 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4552 Sub = VecOp->getDefiningRecipe();
4553 VecOp = Tmp;
4554 }
4555
4556 // If ValB is a constant and can be safely extended, truncate it to the same
4557 // type as ExtA's operand, then extend it to the same type as ExtA. This
4558 // creates two uniform extends that can more easily be matched by the rest of
4559 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4560 // replaced with the new extend of the constant.
4561 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4562 VPWidenCastRecipe *&ExtB,
4563 VPValue *&ValB,
4564 VPWidenRecipe *Mul) {
4565 if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
4566 return;
4567 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4568 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4569 const APInt *Const;
4570 if (!match(ValB, m_APInt(Const)) ||
4572 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4573 return;
4574 // The truncate ensures that the type of each extended operand is the
4575 // same, and it's been proven that the constant can be extended from
4576 // NarrowTy safely. Necessary since ExtA's extended operand would be
4577 // e.g. an i8, while the const will likely be an i32. This will be
4578 // elided by later optimisations.
4579 VPBuilder Builder(Mul);
4580 auto *Trunc =
4581 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4582 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4583 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4584 Mul->setOperand(1, ExtB);
4585 };
4586
4587 // Try to match reduce.add(mul(...)).
4588 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4591 auto *Mul = cast<VPWidenRecipe>(VecOp);
4592
4593 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4594 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4595
4596 // Match reduce.add/sub(mul(ext, ext)).
4597 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4598 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4599 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4600 if (Sub)
4601 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4602 cast<VPWidenRecipe>(Sub), Red);
4603 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4604 }
4605 // TODO: Add an expression type for this variant with a negated mul
4606 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4607 return new VPExpressionRecipe(Mul, Red);
4608 }
4609 // TODO: Add an expression type for negated versions of other expression
4610 // variants.
4611 if (Sub)
4612 return nullptr;
4613
4614 // Match reduce.add(ext(mul(A, B))).
4615 if (!Red->isPartialReduction() &&
4616 match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4617 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4618 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4621
4622 // reduce.add(ext(mul(ext, const)))
4623 // -> reduce.add(ext(mul(ext, ext(const))))
4624 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4625
4626 // reduce.add(ext(mul(ext(A), ext(B))))
4627 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4628 // The inner extends must either have the same opcode as the outer extend or
4629 // be the same, in which case the multiply can never result in a negative
4630 // value and the outer extend can be folded away by doing wider
4631 // extends for the operands of the mul.
4632 if (Ext0 && Ext1 &&
4633 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4634 Ext0->getOpcode() == Ext1->getOpcode() &&
4635 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4636 auto *NewExt0 = new VPWidenCastRecipe(
4637 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4638 *Ext0, *Ext0, Ext0->getDebugLoc());
4639 NewExt0->insertBefore(Ext0);
4640
4641 VPWidenCastRecipe *NewExt1 = NewExt0;
4642 if (Ext0 != Ext1) {
4643 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4644 Ext->getResultType(), nullptr, *Ext1,
4645 *Ext1, Ext1->getDebugLoc());
4646 NewExt1->insertBefore(Ext1);
4647 }
4648 Mul->setOperand(0, NewExt0);
4649 Mul->setOperand(1, NewExt1);
4650 Red->setOperand(1, Mul);
4651 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4652 }
4653 }
4654 return nullptr;
4655}
4656
4657 /// This function tries to create abstract recipes from the reduction recipe for
4658 /// following optimizations and cost estimation.
// NOTE(review): the declaration line (original line 4659, carrying the
// function name and the reduction-recipe parameter `Red`) is elided in this
// listing; only the trailing parameters are visible below.
4660 VPCostContext &Ctx,
4661 VFRange &Range) {
// Try the more specific multiply-accumulate pattern first, then fall back to
// the plain extended-reduction pattern. Each helper clamps Range when the
// abstract form is profitable for some VFs.
4662 VPExpressionRecipe *AbstractR = nullptr;
// Capture the insertion point before matching: the abstract recipe is placed
// immediately after Red in its parent block.
4663 auto IP = std::next(Red->getIterator());
4664 auto *VPBB = Red->getParent();
4665 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4666 AbstractR = MulAcc;
4667 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4668 AbstractR = ExtRed;
4669 // Cannot create abstract inloop reduction recipes.
4670 if (!AbstractR)
4671 return;
4672
// Insert the abstract recipe and redirect all of Red's users to it. Red is
// not erased here; it lives on as a constituent of the expression recipe.
4673 AbstractR->insertBefore(*VPBB, IP);
4674 Red->replaceAllUsesWith(AbstractR);
4675}
4676
4687
4689 if (Plan.hasScalarVFOnly())
4690 return;
4691
4692#ifndef NDEBUG
4693 VPDominatorTree VPDT(Plan);
4694#endif
4695
4696 SmallVector<VPValue *> VPValues;
4699 append_range(VPValues, Plan.getLiveIns());
4700 for (VPRecipeBase &R : *Plan.getEntry())
4701 append_range(VPValues, R.definedValues());
4702
4703 auto *VectorPreheader = Plan.getVectorPreheader();
4704 for (VPValue *VPV : VPValues) {
4706 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4707 continue;
4708
4709 // Add explicit broadcast at the insert point that dominates all users.
4710 VPBasicBlock *HoistBlock = VectorPreheader;
4711 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4712 for (VPUser *User : VPV->users()) {
4713 if (User->usesScalars(VPV))
4714 continue;
4715 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4716 HoistPoint = HoistBlock->begin();
4717 else
4718 assert(VPDT.dominates(VectorPreheader,
4719 cast<VPRecipeBase>(User)->getParent()) &&
4720 "All users must be in the vector preheader or dominated by it");
4721 }
4722
4723 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4724 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4725 VPV->replaceUsesWithIf(Broadcast,
4726 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4727 return Broadcast != &U && !U.usesScalars(VPV);
4728 });
4729 }
4730}
4731
4733 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4734
4735 // Collect candidate loads with invariant addresses and noalias scopes
4736 // metadata and memory-writing recipes with noalias metadata.
4740 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4741 for (VPRecipeBase &R : *VPBB) {
4742 // Only handle single-scalar replicated loads with invariant addresses.
4743 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4744 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4745 RepR->getOpcode() != Instruction::Load)
4746 continue;
4747
4748 VPValue *Addr = RepR->getOperand(0);
4749 if (Addr->isDefinedOutsideLoopRegions()) {
4751 if (!Loc.AATags.Scope)
4752 continue;
4753 CandidateLoads.push_back({RepR, Loc});
4754 }
4755 }
4756 if (R.mayWriteToMemory()) {
4758 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4759 return;
4760 Stores.push_back(*Loc);
4761 }
4762 }
4763 }
4764
4765 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4766 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4767 // Hoist the load to the preheader if it doesn't alias with any stores
4768 // according to the noalias metadata. Other loads should have been hoisted
4769 // by other passes
4770 const AAMDNodes &LoadAA = LoadLoc.AATags;
4771 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4773 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4774 })) {
4775 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4776 }
4777 }
4778}
4779
4780 // Collect common metadata from a group of replicate recipes by intersecting
4781 // metadata from all recipes in the group.
// NOTE(review): the signature line (original line 4782) is elided in this
// listing; `Recipes` is presumably an ArrayRef of VPReplicateRecipe* — confirm
// against the full source.
// Precondition: Recipes must be non-empty (front() is dereferenced
// unconditionally).
4783 VPIRMetadata CommonMetadata = *Recipes.front();
// Start from the first recipe's metadata and narrow it to the intersection
// with every remaining member; the result is metadata valid for all of them.
4784 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4785 CommonMetadata.intersect(*Recipe);
4786 return CommonMetadata;
4787}
4788
4789template <unsigned Opcode>
4793 const Loop *L) {
4794 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4795 "Only Load and Store opcodes supported");
4796 constexpr bool IsLoad = (Opcode == Instruction::Load);
4797 VPTypeAnalysis TypeInfo(Plan);
4798
4799 // For each address, collect operations with the same or complementary masks.
4801 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4802 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4803 };
4805 Plan, PSE, L,
4806 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4807 for (auto Recipes : Groups) {
4808 if (Recipes.size() < 2)
4809 continue;
4810
4811 // Collect groups with the same or complementary masks.
4812 for (VPReplicateRecipe *&RecipeI : Recipes) {
4813 if (!RecipeI)
4814 continue;
4815
4816 VPValue *MaskI = RecipeI->getMask();
4817 Type *TypeI = GetLoadStoreValueType(RecipeI);
4819 Group.push_back(RecipeI);
4820 RecipeI = nullptr;
4821
4822 // Find all operations with the same or complementary masks.
4823 bool HasComplementaryMask = false;
4824 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4825 if (!RecipeJ)
4826 continue;
4827
4828 VPValue *MaskJ = RecipeJ->getMask();
4829 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4830 if (TypeI == TypeJ) {
4831 // Check if any operation in the group has a complementary mask with
4832 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4833 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4834 match(MaskJ, m_Not(m_Specific(MaskI)));
4835 Group.push_back(RecipeJ);
4836 RecipeJ = nullptr;
4837 }
4838 }
4839
4840 if (HasComplementaryMask) {
4841 assert(Group.size() >= 2 && "must have at least 2 entries");
4842 AllGroups.push_back(std::move(Group));
4843 }
4844 }
4845 }
4846
4847 return AllGroups;
4848}
4849
4850 // Find the recipe with minimum alignment in the group.
4851 template <typename InstType>
4852 static VPReplicateRecipe *
// NOTE(review): the line carrying the function name and parameter list
// (original line 4853) is elided; callers below invoke it as
// findRecipeWithMinAlign<LoadInst>(Group) / findRecipeWithMinAlign<StoreInst>(Group).
// Precondition: Group must be non-empty (the min_element result is
// dereferenced unconditionally). InstType must match the underlying
// instruction kind of every member, as each is cast<InstType>() unchecked.
// Ties resolve to the earliest such recipe (min_element semantics).
4854 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4855 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4856 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4857 });
4858}
4859
4862 const Loop *L) {
4863 auto Groups =
4865 if (Groups.empty())
4866 return;
4867
4868 // Process each group of loads.
4869 for (auto &Group : Groups) {
4870 // Try to use the earliest (most dominating) load to replace all others.
4871 VPReplicateRecipe *EarliestLoad = Group[0];
4872 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4873 VPBasicBlock *LastBB = Group.back()->getParent();
4874
4875 // Check that the load doesn't alias with stores between first and last.
4876 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4877 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4878 continue;
4879
4880 // Collect common metadata from all loads in the group.
4881 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4882
4883 // Find the load with minimum alignment to use.
4884 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4885
4886 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4887 assert(all_of(Group,
4888 [IsSingleScalar](VPReplicateRecipe *R) {
4889 return R->isSingleScalar() == IsSingleScalar;
4890 }) &&
4891 "all members in group must agree on IsSingleScalar");
4892
4893 // Create an unpredicated version of the earliest load with common
4894 // metadata.
4895 auto *UnpredicatedLoad = new VPReplicateRecipe(
4896 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4897 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4898
4899 UnpredicatedLoad->insertBefore(EarliestLoad);
4900
4901 // Replace all loads in the group with the unpredicated load.
4902 for (VPReplicateRecipe *Load : Group) {
4903 Load->replaceAllUsesWith(UnpredicatedLoad);
4904 Load->eraseFromParent();
4905 }
4906 }
4907}
4908
// Returns true if the group of predicated stores can be sunk to a single
// unconditional store, based on noalias scope metadata: the group's memory
// location must carry alias scopes, and no intervening memory access between
// the first and last member may alias it.
4909 static bool
// NOTE(review): the line with the function name and first parameter
// (original line 4910, presumably `canSinkStoreWithNoAliasCheck(ArrayRef<...>
// StoresToSink, ...`) is elided in this listing.
4911 PredicatedScalarEvolution &PSE, const Loop &L,
4912 VPTypeAnalysis &TypeInfo) {
// Without scope metadata on the representative store we cannot prove
// anything, so conservatively refuse to sink.
4913 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4914 if (!StoreLoc || !StoreLoc->AATags.Scope)
4915 return false;
4916
4917 // When sinking a group of stores, all members of the group alias each other.
4918 // Skip them during the alias checks.
4919 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4920 StoresToSink.end());
4921
// The alias walk covers the span from the first member's block to the last
// member's block, exempting the group itself via SinkInfo.
4922 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4923 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4924 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4925 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4926}
4927
4930 const Loop *L) {
4931 auto Groups =
4933 if (Groups.empty())
4934 return;
4935
4936 VPTypeAnalysis TypeInfo(Plan);
4937
4938 for (auto &Group : Groups) {
4939 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4940 continue;
4941
4942 // Use the last (most dominated) store's location for the unconditional
4943 // store.
4944 VPReplicateRecipe *LastStore = Group.back();
4945 VPBasicBlock *InsertBB = LastStore->getParent();
4946
4947 // Collect common alias metadata from all stores in the group.
4948 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4949
4950 // Build select chain for stored values.
4951 VPValue *SelectedValue = Group[0]->getOperand(0);
4952 VPBuilder Builder(InsertBB, LastStore->getIterator());
4953
4954 bool IsSingleScalar = Group[0]->isSingleScalar();
4955 for (unsigned I = 1; I < Group.size(); ++I) {
4956 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4957 "all members in group must agree on IsSingleScalar");
4958 VPValue *Mask = Group[I]->getMask();
4959 VPValue *Value = Group[I]->getOperand(0);
4960 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4961 Group[I]->getDebugLoc());
4962 }
4963
4964 // Find the store with minimum alignment to use.
4965 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4966
4967 // Create unconditional store with selected value and common metadata.
4968 auto *UnpredicatedStore = new VPReplicateRecipe(
4969 StoreWithMinAlign->getUnderlyingInstr(),
4970 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
4971 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4972 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4973
4974 // Remove all predicated stores from the group.
4975 for (VPReplicateRecipe *Store : Group)
4976 Store->eraseFromParent();
4977 }
4978}
4979
4981 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4983 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4984 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4985
4986 VPValue *TC = Plan.getTripCount();
4987 if (TC->getNumUsers() == 0)
4988 return;
4989
4990 // Skip cases for which the trip count may be non-trivial to materialize.
4991 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4992 // tail is required.
4993 if (!Plan.hasScalarTail() ||
4995 Plan.getScalarPreheader() ||
4996 !isa<VPIRValue>(TC))
4997 return;
4998
4999 // Materialize vector trip counts for constants early if it can simply
5000 // be computed as (Original TC / VF * UF) * VF * UF.
5001 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5002 // tail-folded loops.
5003 ScalarEvolution &SE = *PSE.getSE();
5004 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5005 if (!isa<SCEVConstant>(TCScev))
5006 return;
5007 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5008 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5009 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5010 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5011}
5012
5014 VPBasicBlock *VectorPH) {
5016 if (BTC->getNumUsers() == 0)
5017 return;
5018
5019 VPBuilder Builder(VectorPH, VectorPH->begin());
5020 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5021 auto *TCMO =
5022 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5023 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5024 BTC->replaceAllUsesWith(TCMO);
5025}
5026
5028 if (Plan.hasScalarVFOnly())
5029 return;
5030
5031 VPTypeAnalysis TypeInfo(Plan);
5032 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5033 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5035 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5036 vp_depth_first_shallow(LoopRegion->getEntry()));
5037 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5038 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5039 // regions. Those are not materialized explicitly yet. Those vector users are
5040 // still handled in VPReplicateRegion::execute(), via shouldPack().
5041 // TODO: materialize build vectors for replicating recipes in replicating
5042 // regions.
5043 for (VPBasicBlock *VPBB :
5044 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5045 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5047 continue;
5048 auto *DefR = cast<VPSingleDefRecipe>(&R);
5049 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5050 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5051 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5052 };
5053 if ((isa<VPReplicateRecipe>(DefR) &&
5054 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5055 (isa<VPInstruction>(DefR) &&
5057 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5058 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5059 continue;
5060
5061 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5062 unsigned Opcode = ScalarTy->isStructTy()
5065 auto *BuildVector = new VPInstruction(Opcode, {DefR});
5066 BuildVector->insertAfter(DefR);
5067
5068 DefR->replaceUsesWithIf(
5069 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5070 VPUser &U, unsigned) {
5071 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5072 });
5073 }
5074 }
5075
5076 // Create explicit VPInstructions to convert vectors to scalars. The current
5077 // implementation is conservative - it may miss some cases that may or may not
5078 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5079 // if they are known to operate on scalar values.
5080 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5081 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5084 continue;
5085 for (VPValue *Def : R.definedValues()) {
5086 // Skip recipes that are single-scalar or only have their first lane
5087 // used.
5088 // TODO: The Defs skipped here may or may not be vector values.
5089 // Introduce Unpacks, and remove them later, if they are guaranteed to
5090 // produce scalar values.
5092 continue;
5093
5094 // At the moment, we create unpacks only for scalar users outside
5095 // replicate regions. Recipes inside replicate regions still extract the
5096 // required lanes implicitly.
5097 // TODO: Remove once replicate regions are unrolled completely.
5098 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5099 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5100 return U->usesScalars(Def) &&
5101 (!ParentRegion || !ParentRegion->isReplicator());
5102 };
5103 if (none_of(Def->users(), IsCandidateUnpackUser))
5104 continue;
5105
5106 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5107 if (R.isPhi())
5108 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5109 else
5110 Unpack->insertAfter(&R);
5111 Def->replaceUsesWithIf(Unpack,
5112 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5113 return IsCandidateUnpackUser(&U);
5114 });
5115 }
5116 }
5117 }
5118}
5119
// Materializes the symbolic vector trip count as explicit recipes in the
// vector preheader: n.vec = TC - (TC urem Step), with TC first rounded up to
// a multiple of Step under tail folding, and the remainder bumped to Step
// when a scalar epilogue iteration is required.
// NOTE(review): the declaration line (original line 5120, carrying the
// function name and the `VPlan &Plan` parameter) is elided in this listing.
5121 VPBasicBlock *VectorPHVPBB,
5122 bool TailByMasking,
5123 bool RequiresScalarEpilogue,
5124 VPValue *Step) {
5125 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5126 // There's nothing to do if there are no users of the vector trip count or its
5127 // IR value has already been set.
5128 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5129 return;
5130
5131 VPValue *TC = Plan.getTripCount();
5132 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5133 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5134 if (auto *StepR = Step->getDefiningRecipe()) {
5135 assert(StepR->getParent() == VectorPHVPBB &&
5136 "Step must be defined in VectorPHVPBB");
5137 // Insert after Step's definition to maintain valid def-use ordering.
5138 InsertPt = std::next(StepR->getIterator());
5139 }
5140 VPBuilder Builder(VectorPHVPBB, InsertPt);
5141
5142 // If the tail is to be folded by masking, round the number of iterations N
5143 // up to a multiple of Step instead of rounding down. This is done by first
5144 // adding Step-1 and then rounding down. Note that it's ok if this addition
5145 // overflows: the vector induction variable will eventually wrap to zero given
5146 // that it starts at zero and its Step is a power of two; the loop will then
5147 // exit, with the last early-exit vector comparison also producing all-true.
5148 if (TailByMasking) {
5149 TC = Builder.createAdd(
5150 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5151 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5152 }
5153
5154 // Now we need to generate the expression for the part of the loop that the
5155 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5156 // iterations are not required for correctness, or N - Step, otherwise. Step
5157 // is equal to the vectorization factor (number of SIMD elements) times the
5158 // unroll factor (number of SIMD instructions).
5159 VPValue *R =
5160 Builder.createNaryOp(Instruction::URem, {TC, Step},
5161 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5162
5163 // There are cases where we *must* run at least one iteration in the remainder
5164 // loop. See the cost model for when this can happen. If the step evenly
5165 // divides the trip count, we set the remainder to be equal to the step. If
5166 // the step does not evenly divide the trip count, no adjustment is necessary
5167 // since there will already be scalar iterations. Note that the minimum
5168 // iterations check ensures that N >= Step.
5169 if (RequiresScalarEpilogue) {
// NOTE(review): the assert message below says "fail folding" — almost
// certainly a typo for "tail folding". Flagged only; a documentation-only
// edit must not alter string literals.
5170 assert(!TailByMasking &&
5171 "requiring scalar epilogue is not supported with fail folding");
5172 VPValue *IsZero =
5173 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5174 R = Builder.createSelect(IsZero, Step, R);
5175 }
5176
// Finally compute n.vec and rewrite all users of the symbolic vector trip
// count to the materialized value.
5177 VPValue *Res =
5178 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5179 VectorTC.replaceAllUsesWith(Res);
5180}
5181
5183 ElementCount VFEC) {
5184 // If VF and VFxUF have already been materialized (no remaining users),
5185 // there's nothing more to do.
5186 if (Plan.getVF().isMaterialized()) {
5187 assert(Plan.getVFxUF().isMaterialized() &&
5188 "VF and VFxUF must be materialized together");
5189 return;
5190 }
5191
5192 VPBuilder Builder(VectorPH, VectorPH->begin());
5193 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5194 VPValue &VF = Plan.getVF();
5195 VPValue &VFxUF = Plan.getVFxUF();
5196 // If there are no users of the runtime VF, compute VFxUF by constant folding
5197 // the multiplication of VF and UF.
5198 if (VF.getNumUsers() == 0) {
5199 VPValue *RuntimeVFxUF =
5200 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5201 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5202 return;
5203 }
5204
5205 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5206 // vscale) * UF.
5207 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5209 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5211 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5212 }
5213 VF.replaceAllUsesWith(RuntimeVF);
5214
5215 VPValue *MulByUF = Builder.createOverflowingOp(
5216 Instruction::Mul,
5217 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5218 {true, false});
5219 VFxUF.replaceAllUsesWith(MulByUF);
5220}
5221
5224 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5225
5226 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5227 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5228 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5229 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5231 continue;
5232 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5233 if (!ExpSCEV)
5234 break;
5235 const SCEV *Expr = ExpSCEV->getSCEV();
5236 Value *Res =
5237 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5238 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5239 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5240 ExpSCEV->replaceAllUsesWith(Exp);
5241 if (Plan.getTripCount() == ExpSCEV)
5242 Plan.resetTripCount(Exp);
5243 ExpSCEV->eraseFromParent();
5244 }
5246 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5247 "before any VPIRInstructions");
5248 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5249 // to the VPIRBasicBlock.
5250 auto EI = Entry->begin();
5251 for (Instruction &I : drop_end(*EntryBB)) {
5252 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5253 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5254 EI++;
5255 continue;
5256 }
5258 }
5259
5260 return ExpandedSCEVs;
5261}
5262
5263/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
5264/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
5265/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
5266/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
5267/// an index-independent load if it feeds all wide ops at all indices (\p OpV
5268/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
5269/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
5270/// is defined at \p Idx of a load interleave group.
5271static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5272 VPValue *OpV, unsigned Idx, bool IsScalable) {
5273 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5274 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5275 if (!Member0OpR)
5276 return Member0Op == OpV;
5277 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5278 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5279 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5280 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5281 Member0Op == OpV;
5282 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5283 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5284 return false;
5285}
5286
/// Returns true if every value in \p Ops is defined by a single-def recipe
/// computing the same opcode/intrinsic as the recipe defining Ops[0], and all
/// operands of those recipes can themselves be narrowed — either recursively
/// via this function or as shared loads via canNarrowLoad. \p IsScalable
/// indicates whether the VF being narrowed is scalable.
5287static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
 5289  auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
 5290  if (!WideMember0)
 5291    return false;
  // All members must compute the same operation as member 0.
 5292  for (VPValue *V : Ops) {
 5294      return false;
 5295    auto *R = cast<VPSingleDefRecipe>(V);
 5296    if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
 5297      return false;
 5298  }
 5299
  // For each operand position, either the collected operands can be narrowed
  // recursively, or every member's operand must be narrowable against member
  // 0's operand (see canNarrowLoad).
 5300  for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
 5302    for (VPValue *Op : Ops)
 5303      OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
 5304
 5305    if (canNarrowOps(OpsI, IsScalable))
 5306      continue;
 5307
 5308    if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
 5309          const auto &[OpIdx, OpV] = P;
 5310          return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
 5311        }))
 5312      return false;
 5313  }
 5314
 5315  return true;
 5316}
5317
5318/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5319/// number of members both equal to VF. The interleave group must also access
5320/// the full vector width.
5321static std::optional<ElementCount> isConsecutiveInterleaveGroup(
 5323    VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
  // Masked groups cannot be narrowed.
 5324  if (!InterleaveR || InterleaveR->getMask())
 5325    return std::nullopt;
 5326
  // All members of the group (defined values for loads, stored values for
  // stores) must have the same scalar element type.
 5327  Type *GroupElementTy = nullptr;
 5328  if (InterleaveR->getStoredValues().empty()) {
 5329    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
 5330    if (!all_of(InterleaveR->definedValues(),
 5331                [&TypeInfo, GroupElementTy](VPValue *Op) {
 5332                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
 5333                }))
 5334      return std::nullopt;
 5335  } else {
 5336    GroupElementTy =
 5337        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
 5338    if (!all_of(InterleaveR->getStoredValues(),
 5339                [&TypeInfo, GroupElementTy](VPValue *Op) {
 5340                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
 5341                }))
 5342      return std::nullopt;
 5343  }
 5344
  // The group must be full, i.e. have a member for every position.
 5345  auto IG = InterleaveR->getInterleaveGroup();
 5346  if (IG->getFactor() != IG->getNumMembers())
 5347    return std::nullopt;
 5348
 5349  auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
 5350    TypeSize Size = TTI.getRegisterBitWidth(
 5353    assert(Size.isScalable() == VF.isScalable() &&
 5354           "if Size is scalable, VF must be scalable and vice versa");
 5355    return Size.getKnownMinValue();
 5356  };
 5357
  // Pick the VF whose factor matches the group and whose total group size
  // saturates the target's vector register width.
 5358  for (ElementCount VF : VFs) {
 5359    unsigned MinVal = VF.getKnownMinValue();
 5360    unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
 5361    if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
 5362      return {VF};
 5363  }
 5364  return std::nullopt;
 5365}
5366
5367/// Returns true if \p VPV is already a narrow (single scalar) VPValue.
5368static bool isAlreadyNarrow(VPValue *VPV) {
5369 if (isa<VPIRValue>(VPV))
5370 return true;
5371 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5372 return RepR && RepR->isSingleScalar();
5373}
5374
5375// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
 5376// a narrow variant. Already-narrow values and values in \p NarrowedOps are
 5376// returned unchanged; wide recipes are narrowed in place by recursively
 5376// narrowing their operands.
5377static VPValue *
 5379  auto *R = V->getDefiningRecipe();
 5380  if (!R || NarrowedOps.contains(V))
 5381    return V;
 5382
 5383  if (isAlreadyNarrow(V))
 5384    return V;
 5385
 5387    auto *WideMember0 = cast<VPSingleDefRecipe>(R);
 5388    for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
 5389      WideMember0->setOperand(
 5390          Idx,
 5391          narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
 5392    return V;
 5393  }
 5394
 5395  if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
 5396    // Narrow interleave group to wide load, as transformed VPlan will only
 5397    // process one original iteration.
 5398    auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
 5399    auto *L = new VPWidenLoadRecipe(
 5400        *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
 5401        /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
 5402    L->insertBefore(LoadGroup);
 5403    NarrowedOps.insert(L);
 5404    return L;
 5405  }
 5406
  // Single-scalar replicated loads are already narrow; just record them.
 5407  if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
 5408    assert(RepR->isSingleScalar() &&
 5409           isa<LoadInst>(RepR->getUnderlyingInstr()) &&
 5410           "must be a single scalar load");
 5411    NarrowedOps.insert(RepR);
 5412    return RepR;
 5413  }
 5414
 5415  auto *WideLoad = cast<VPWidenLoadRecipe>(R);
 5416  VPValue *PtrOp = WideLoad->getAddr();
 5417  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
 5418    PtrOp = VecPtr->getOperand(0);
 5419  // Narrow wide load to uniform scalar load, as transformed VPlan will only
 5420  // process one original iteration.
 5421  auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
 5422                                  /*IsUniform*/ true,
 5423                                  /*Mask*/ nullptr, {}, *WideLoad);
 5424  N->insertBefore(WideLoad);
 5425  NarrowedOps.insert(N);
 5426  return N;
 5427}
5428
/// Try to narrow all interleave groups in \p Plan to scalar-per-member
/// accesses for a single suitable VF. On success, the original \p Plan is
/// restricted to that VF and a clone containing the remaining VFs is
/// returned (or nullptr if the Plan had only one VF).
5429std::unique_ptr<VPlan>
 5431                                         const TargetTransformInfo &TTI) {
 5432  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
 5433
 5434  if (!VectorLoop)
 5435    return nullptr;
 5436
 5437  // Only handle single-block loops for now.
 5438  if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
 5439    return nullptr;
 5440
 5441  // Skip plans when we may not be able to properly narrow.
 5442  VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
 5443  if (!match(&Exiting->back(), m_BranchOnCount()))
 5444    return nullptr;
 5445
 5446  assert(match(&Exiting->back(),
 5448                                m_Specific(&Plan.getVectorTripCount()))) &&
 5449         "unexpected branch-on-count");
 5450
 5451  VPTypeAnalysis TypeInfo(Plan);
 5453  std::optional<ElementCount> VFToOptimize;
 5454  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
 5456      continue;
 5457
 5460      continue;
 5461
 5462    // Bail out on recipes not supported at the moment:
 5463    // * phi recipes other than the canonical induction
 5464    // * recipes writing to memory except interleave groups
 5465    // Only support plans with a canonical induction phi.
 5466    if (R.isPhi())
 5467      return nullptr;
 5468
 5469    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
 5470    if (R.mayWriteToMemory() && !InterleaveR)
 5471      return nullptr;
 5472
 5473    // All other ops are allowed, but we reject uses that cannot be converted
 5474    // when checking all allowed consumers (store interleave groups) below.
 5475    if (!InterleaveR)
 5476      continue;
 5477
 5478    // Try to find a single VF, where all interleave groups are consecutive and
 5479    // saturate the full vector width. If we already have a candidate VF, check
 5480    // if it is applicable for the current InterleaveR, otherwise look for a
 5481    // suitable VF across the Plan's VFs.
 5483        VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
 5484                     : to_vector(Plan.vectorFactors());
 5485    std::optional<ElementCount> NarrowedVF =
 5486        isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
 5487    if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
 5488      return nullptr;
 5489    VFToOptimize = NarrowedVF;
 5490
 5491    // Skip read interleave groups.
 5492    if (InterleaveR->getStoredValues().empty())
 5493      continue;
 5494
 5495    // Narrow interleave groups, if all operands are already matching narrow
 5496    // ops.
 5497    auto *Member0 = InterleaveR->getStoredValues()[0];
 5498    if (isAlreadyNarrow(Member0) &&
 5499        all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
 5500      StoreGroups.push_back(InterleaveR);
 5501      continue;
 5502    }
 5503
 5504    // For now, we only support full interleave groups storing load interleave
 5505    // groups.
 5506    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
 5507          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
 5508          if (!DefR)
 5509            return false;
 5510          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
 5511          return IR && IR->getInterleaveGroup()->isFull() &&
 5512                 IR->getVPValue(Op.index()) == Op.value();
 5513        })) {
 5514      StoreGroups.push_back(InterleaveR);
 5515      continue;
 5516    }
 5517
 5518    // Check if all values feeding InterleaveR are matching wide recipes,
 5519    // whose operands can be narrowed.
 5520    if (!canNarrowOps(InterleaveR->getStoredValues(),
 5521                      VFToOptimize->isScalable()))
 5522      return nullptr;
 5523    StoreGroups.push_back(InterleaveR);
 5524  }
 5525
 5526  if (StoreGroups.empty())
 5527    return nullptr;
 5528
 5529  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
 5530  bool RequiresScalarEpilogue =
 5531      MiddleVPBB->getNumSuccessors() == 1 &&
 5532      MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
 5533  // Bail out for tail-folding (middle block with a single successor to exit).
 5534  if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
 5535    return nullptr;
 5536
 5537  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
 5538  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
 5539  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
 5540  // TODO: Handle cases where only some interleave groups can be narrowed.
 5541  std::unique_ptr<VPlan> NewPlan;
 5542  if (size(Plan.vectorFactors()) != 1) {
 5543    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
 5544    Plan.setVF(*VFToOptimize);
 5545    NewPlan->removeVF(*VFToOptimize);
 5546  }
 5547
 5548  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
 5549  SmallPtrSet<VPValue *, 4> NarrowedOps;
 5550  // Narrow operation tree rooted at store groups.
 5551  for (auto *StoreGroup : StoreGroups) {
 5552    VPValue *Res =
 5553        narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
 5554    auto *SI =
 5555        cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
 5556    auto *S = new VPWidenStoreRecipe(
 5557        *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
 5558        /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
 5559    S->insertBefore(StoreGroup);
 5560    StoreGroup->eraseFromParent();
 5561  }
 5562
 5563  // Adjust induction to reflect that the transformed plan only processes one
 5564  // original iteration.
 5565  auto *CanIV = VectorLoop->getCanonicalIV();
 5566  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
 5567  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
 5568  VPBuilder PHBuilder(VectorPH, VectorPH->begin());
 5569
  // The narrowed step is UF for fixed VFs, and vscale * UF for scalable VFs.
 5570  VPValue *UF = &Plan.getUF();
 5571  VPValue *Step;
 5572  if (VFToOptimize->isScalable()) {
 5573    VPValue *VScale = PHBuilder.createElementCount(
 5575    Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
 5576                                         {true, false});
 5577    Plan.getVF().replaceAllUsesWith(VScale);
 5578  } else {
 5579    Step = UF;
 5581        Plan.getConstantInt(CanIV->getScalarType(), 1));
 5582  }
 5583  // Materialize vector trip count with the narrowed step.
 5584  materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
 5585                             RequiresScalarEpilogue, Step);
 5586
 5587  Inc->setOperand(1, Step);
 5588  Plan.getVFxUF().replaceAllUsesWith(Step);
 5589
 5590  removeDeadRecipes(Plan);
 5591  assert(none_of(*VectorLoop->getEntryBasicBlock(),
 5593         "All VPVectorPointerRecipes should have been removed");
 5594  return NewPlan;
 5595}
5596
5597/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
 5598/// BranchOnCond recipe. The weights model the expectation that the scalar
 5598/// epilogue executes roughly once per VectorStep original iterations.
5600    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
 5601  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
 5602  auto *MiddleTerm =
 5604  // Only add branch metadata if there is a (conditional) terminator.
 5605  if (!MiddleTerm)
 5606    return;
 5607
 5608  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
 5609         "must have a BranchOnCond");
 5610  // Assume that `TripCount % VectorStep` is equally distributed.
 5611  unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
  // For scalable VFs, scale the step by the tuning vscale, if known.
 5612  if (VF.isScalable() && VScaleForTuning.has_value())
 5613    VectorStep *= *VScaleForTuning;
 5614  assert(VectorStep > 0 && "trip count should not be zero");
 5615  MDBuilder MDB(Plan.getContext());
  // Weight 1 : (VectorStep - 1) between the two successors of the middle
  // block's conditional branch.
 5616  MDNode *BranchWeights =
 5617      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
 5618  MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
 5619}
5620
5622                                                         VFRange &Range) {
 5623  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
 5624  auto *MiddleVPBB = Plan.getMiddleBlock();
 5625  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 5626
  // Predicate for VF == vscale x 1, where penultimate-element extraction may
  // be impossible (see comment further down).
 5627  auto IsScalableOne = [](ElementCount VF) -> bool {
 5628    return VF == ElementCount::getScalable(1);
 5629  };
 5630
 5631  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
 5632    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
 5633    if (!FOR)
 5634      continue;
 5635
 5636    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
 5637           "Cannot handle loops with uncountable early exits");
 5638
 5639    // This is the second phase of vectorizing first-order recurrences, creating
 5640    // extract for users outside the loop. An overview of the transformation is
 5641    // described below. Suppose we have the following loop with some use after
 5642    // the loop of the last a[i-1],
 5643    //
 5644    // for (int i = 0; i < n; ++i) {
 5645    //   t = a[i - 1];
 5646    //   b[i] = a[i] - t;
 5647    // }
 5648    // use t;
 5649    //
 5650    // There is a first-order recurrence on "a". For this loop, the shorthand
 5651    // scalar IR looks like:
 5652    //
 5653    // scalar.ph:
 5654    //   s.init = a[-1]
 5655    //   br scalar.body
 5656    //
 5657    // scalar.body:
 5658    //   i = phi [0, scalar.ph], [i+1, scalar.body]
 5659    //   s1 = phi [s.init, scalar.ph], [s2, scalar.body]
 5660    //   s2 = a[i]
 5661    //   b[i] = s2 - s1
 5662    //   br cond, scalar.body, exit.block
 5663    //
 5664    // exit.block:
 5665    //   use = lcssa.phi [s1, scalar.body]
 5666    //
 5667    // In this example, s1 is a recurrence because it's value depends on the
 5668    // previous iteration. In the first phase of vectorization, we created a
 5669    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
 5670    // for users in the scalar preheader and exit block.
 5671    //
 5672    // vector.ph:
 5673    //   v_init = vector(..., ..., ..., a[-1])
 5674    //   br vector.body
 5675    //
 5676    // vector.body
 5677    //   i = phi [0, vector.ph], [i+4, vector.body]
 5678    //   v1 = phi [v_init, vector.ph], [v2, vector.body]
 5679    //   v2 = a[i, i+1, i+2, i+3]
 5680    //   b[i] = v2 - v1
 5681    //   // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
 5682    //   b[i, i+1, i+2, i+3] = v2 - v1
 5683    //   br cond, vector.body, middle.block
 5684    //
 5685    // middle.block:
 5686    //   vector.recur.extract.for.phi = v2(2)
 5687    //   vector.recur.extract = v2(3)
 5688    //   br cond, scalar.ph, exit.block
 5689    //
 5690    // scalar.ph:
 5691    //   scalar.recur.init = phi [vector.recur.extract, middle.block],
 5692    //                           [s.init, otherwise]
 5693    //   br scalar.body
 5694    //
 5695    // scalar.body:
 5696    //   i = phi [0, scalar.ph], [i+1, scalar.body]
 5697    //   s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
 5698    //   s2 = a[i]
 5699    //   b[i] = s2 - s1
 5700    //   br cond, scalar.body, exit.block
 5701    //
 5702    // exit.block:
 5703    //   lo = lcssa.phi [s1, scalar.body],
 5704    //                  [vector.recur.extract.for.phi, middle.block]
 5705    //
 5706    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
 5707    // Extract the penultimate value of the recurrence and use it as operand for
 5708    // the VPIRInstruction modeling the phi.
 5710             make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
 5712        continue;
 5713
 5714      // For VF vscale x 1, if vscale = 1, we are unable to extract the
 5715      // penultimate value of the recurrence. Instead we rely on the existing
 5716      // extract of the last element from the result of
 5717      // VPInstruction::FirstOrderRecurrenceSplice.
 5718      // TODO: Consider vscale_range info and UF.
 5720                                            Range))
 5721        return;
 5722      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
 5723          VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
 5724          "vector.recur.extract.for.phi");
  // Rewire all exit-block phis that used the recurrence to the new extract.
 5725      for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
 5726        auto *ExitPhi = dyn_cast<VPIRPhi>(U);
 5727        if (!ExitPhi)
 5728          continue;
 5729        ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
 5730      }
 5731    }
 5732  }
 5733}
5734
5737                                                  Loop &L) {
 5738  ScalarEvolution &SE = *PSE.getSE();
 5739  VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
 5740
 5741  // Helper lambda to check if the IV range excludes the sentinel value.
 5741  // Returns the sentinel when it is provably outside the IV's value range,
 5741  // std::nullopt otherwise.
 5742  auto CheckSentinel = [&SE](const SCEV *IVSCEV, bool UseMax,
 5743                             bool Signed) -> std::optional<APInt> {
 5744    unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
 5745    APInt Sentinel =
 5746        UseMax
 5749
 5750    ConstantRange IVRange =
 5751        Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
 5752    if (!IVRange.contains(Sentinel))
 5753      return Sentinel;
 5754    return std::nullopt;
 5755  };
 5756
 5757  VPValue *HeaderMask = vputils::findHeaderMask(Plan);
 5758  for (VPRecipeBase &Phi :
 5759       make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
 5760    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
 5762                                          PhiR->getRecurrenceKind()))
 5763      continue;
 5764
  // Pointer and FP IVs are not handled by this transform.
 5765    Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
 5766    if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
 5767      continue;
 5768
 5769    // If there's a header mask, the backedge select will not be the find-last
 5770    // select.
 5771    VPValue *BackedgeVal = PhiR->getBackedgeValue();
 5772    VPValue *CondSelect = BackedgeVal;
 5773    if (HeaderMask &&
 5774        !match(BackedgeVal, m_Select(m_Specific(HeaderMask),
 5775                                     m_VPValue(CondSelect), m_Specific(PhiR))))
 5776      llvm_unreachable("expected header mask select");
 5777
 5778    // Get the IV from the conditional select of the reduction phi.
 5779    // The conditional select should be a select between the phi and the IV.
 5780    VPValue *Cond, *TrueVal, *FalseVal;
 5781    if (!match(CondSelect, m_Select(m_VPValue(Cond), m_VPValue(TrueVal),
 5782                                    m_VPValue(FalseVal))))
 5783      continue;
 5784
 5785    // The non-phi operand of the select is the IV.
 5786    assert(is_contained(CondSelect->getDefiningRecipe()->operands(), PhiR));
 5787    VPValue *IV = TrueVal == PhiR ? FalseVal : TrueVal;
 5788
 5789    const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(IV, PSE, &L);
 5790    const SCEV *Step;
 5791    if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step))))
 5792      continue;
 5793
 5794    // Determine direction from SCEV step.
 5795    if (!SE.isKnownNonZero(Step))
 5796      continue;
 5797
 5798    // Positive step means we need UMax/SMax to find the last IV value, and
 5799    // UMin/SMin otherwise.
 5800    bool UseMax = SE.isKnownPositive(Step);
  // Prefer a signed sentinel; fall back to an unsigned one.
 5801    bool UseSigned = true;
 5802    std::optional<APInt> SentinelVal =
 5803        CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/true);
 5804    if (!SentinelVal) {
 5805      SentinelVal = CheckSentinel(IVSCEV, UseMax, /*IsSigned=*/false);
 5806      UseSigned = false;
 5807    }
 5808
 5809    // If no sentinel was found, fall back to a boolean AnyOf reduction to track
 5810    // if the condition was ever true. Requires the IV to not wrap, otherwise we
 5811    // cannot use min/max.
 5812    if (!SentinelVal) {
 5813      auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
 5814      if (AR->hasNoSignedWrap())
 5815        UseSigned = true;
 5816      else if (AR->hasNoUnsignedWrap())
 5817        UseSigned = false;
 5818      else
 5819        continue;
 5820    }
 5821
 5823        BackedgeVal,
 5825
 5826    RecurKind MinMaxKind =
 5827        UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
 5828               : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
 5829    VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
 5830                    FastMathFlags());
 5831    DebugLoc ExitDL = RdxResult->getDebugLoc();
 5832    VPBuilder MiddleBuilder(RdxResult);
 5833    VPValue *ReducedIV =
 5835                                    RdxResult->getOperand(0), Flags, ExitDL);
 5836
 5837    VPValue *NewRdxResult;
 5838    VPValue *StartVPV = PhiR->getStartValue();
 5839    if (SentinelVal) {
 5840      // Sentinel-based approach: reduce IVs with min/max, compare against
 5841      // sentinel to detect if condition was ever true, select accordingly.
 5842      VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
 5843      auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
 5844                                           Sentinel, ExitDL);
 5845      NewRdxResult =
 5846          MiddleBuilder.createSelect(Cmp, ReducedIV, StartVPV, ExitDL);
 5847      StartVPV = Sentinel;
 5848    } else {
 5849      // Introduce a boolean AnyOf reduction to track if the condition was ever
 5850      // true in the loop. Use it to select the initial start value, if it was
 5851      // never true.
 5852      auto *AnyOfPhi = new VPReductionPHIRecipe(
 5853          /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
 5854          RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
 5855      AnyOfPhi->insertAfter(PhiR);
 5856
 5857      VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
  // Invert the condition if the select keeps the phi on the true arm.
 5858      VPValue *AnyOfCond = Cond;
 5859      if (TrueVal == PhiR)
 5860        AnyOfCond = LoopBuilder.createNot(Cond);
 5861      VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, AnyOfCond);
 5862      AnyOfPhi->setOperand(1, OrVal);
 5863
 5864      NewRdxResult =
 5866                                      {StartVPV, ReducedIV, OrVal}, {}, ExitDL);
 5867
 5868      // Initialize the IV reduction phi with the neutral element, not the
 5869      // original start value, to ensure correct min/max reduction results.
 5870      StartVPV = Plan.getOrAddLiveIn(
 5871          getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
 5872    }
 5873    RdxResult->replaceAllUsesWith(NewRdxResult);
 5874    RdxResult->eraseFromParent();
 5875
  // Replace the original reduction phi with a FindIV reduction phi.
 5876    auto *NewPhiR = new VPReductionPHIRecipe(
 5877        cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
 5878        *CondSelect, RdxUnordered{1}, {}, PhiR->hasUsesOutsideReductionChain());
 5879    NewPhiR->insertBefore(PhiR);
 5880    PhiR->replaceAllUsesWith(NewPhiR);
 5881    PhiR->eraseFromParent();
 5882  }
 5883}
5884
5885namespace {
5886
5887/// Holds the binary operation used to compute the extended operand and the
 5888/// casts that feed into it.
 5889struct ExtendedReductionOperand {
  // Binary operation combining the extended values (may be the reduction
  // bin-op itself when the operand is a plain extend).
 5890  VPWidenRecipe *BinOp = nullptr;
 5891  // Note: The second cast recipe may be null.
 5892  std::array<VPWidenCastRecipe *, 2> CastRecipes = {};
 5893};
5894
5895/// A chain of recipes that form a partial reduction. Matches either
 5896/// reduction_bin_op (extend (A), accumulator), or
 5897/// reduction_bin_op (bin_op (extend (A), (extend (B))), accumulator).
 5898struct VPPartialReductionChain {
 5899  /// The top-level binary operation that forms the reduction to a scalar
 5900  /// after the loop body.
 5901  VPWidenRecipe *ReductionBinOp;
 5902  /// The user of the extends that is then reduced.
 5903  ExtendedReductionOperand ExtendedOp;
  /// Factor by which the input VF is wider than the accumulating PHI's VF.
 5904  unsigned ScaleFactor;
 5905  /// The recurrence kind for the entire partial reduction chain.
 5906  /// This allows distinguishing between Sub and AddWithSub recurrences,
 5907  /// when the ReductionBinOp is a Instruction::Sub.
 5908  RecurKind RK;
 5909};
5910
/// Rewrite extend patterns feeding a partial reduction into forms the target
/// can fold. Returns the (possibly replaced) recipe to reduce.
5911static VPSingleDefRecipe *
 5912optimizeExtendsForPartialReduction(VPSingleDefRecipe *BinOp,
 5913                                   VPTypeAnalysis &TypeInfo) {
 5914  // reduce.add(mul(ext(A), C))
 5915  // -> reduce.add(mul(ext(A), ext(trunc(C))))
 5916  const APInt *Const;
 5917  if (match(BinOp, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
 5918    auto *ExtA = cast<VPWidenCastRecipe>(BinOp->getOperand(0));
 5919    Instruction::CastOps ExtOpc = ExtA->getOpcode();
 5920    Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
  // Only safe if the constant survives the narrow-and-reextend round trip.
 5921    if (!BinOp->hasOneUse() ||
 5923            Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
 5924      return BinOp;
 5925
 5926    VPBuilder Builder(BinOp);
 5927    auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
 5928                                          BinOp->getOperand(1), NarrowTy);
 5929    Type *WideTy = TypeInfo.inferScalarType(ExtA);
 5930    BinOp->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
 5931    return BinOp;
 5932  }
 5933
 5934  // reduce.add(ext(mul(ext(A), ext(B))))
 5935  // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
 5936  // TODO: Support this optimization for float types.
 5938                            m_ZExtOrSExt(m_VPValue()))))) {
 5939    auto *Ext = cast<VPWidenCastRecipe>(BinOp);
 5940    auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
 5941    auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
 5942    auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
  // The inner extends must agree with each other and with the outer extend
  // (or be the same recipe) for the fold to be valid.
 5943    if (!Mul->hasOneUse() ||
 5944        (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
 5945        MulLHS->getOpcode() != MulRHS->getOpcode())
 5946      return BinOp;
 5947    VPBuilder Builder(Mul);
 5948    Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
 5949                                               MulLHS->getOperand(0),
 5950                                               Ext->getResultType()));
 5951    Mul->setOperand(1, MulLHS == MulRHS
 5952                           ? Mul->getOperand(0)
 5953                           : Builder.createWidenCast(MulRHS->getOpcode(),
 5954                                                     MulRHS->getOperand(0),
 5955                                                     Ext->getResultType()));
 5956    return Mul;
 5957  }
 5958
 5959  return BinOp;
 5960}
5961
5962// Helper to transform a partial reduction chain into a partial reduction
 5963// recipe. Assumes profitability has been checked.
 5964static void transformToPartialReduction(const VPPartialReductionChain &Chain,
 5965                                        VPTypeAnalysis &TypeInfo, VPlan &Plan,
 5966                                        VPReductionPHIRecipe *RdxPhi) {
 5967  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
 5968  assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
 5969
 5970  VPValue *BinOpVal = WidenRecipe->getOperand(0);
 5971  VPValue *Accumulator = WidenRecipe->getOperand(1);
 5972
 5973  // Swap if needed to ensure Accumulator is the PHI or partial reduction.
 5975      isa<VPExpressionRecipe>(BinOpVal))
 5976    std::swap(BinOpVal, Accumulator);
 5977  auto *BinOp = cast<VPSingleDefRecipe>(BinOpVal->getDefiningRecipe());
 5978
 5979  // Sub-reductions can be implemented in two ways:
 5980  // (1) negate the operand in the vector loop (the default way).
 5981  // (2) subtract the reduced value from the init value in the middle block.
 5982  // Both ways keep the reduction itself as an 'add' reduction.
 5983  //
 5984  // The ISD nodes for partial reductions don't support folding the
 5985  // sub/negation into its operands because the following is not a valid
 5986  // transformation:
 5987  //   sub(0, mul(ext(a), ext(b)))
 5988  //   -> mul(ext(a), ext(sub(0, b)))
 5989  //
 5990  // It's therefore better to choose option (2) such that the partial
 5991  // reduction is always positive (starting at '0') and to do a final
 5992  // subtract in the middle block.
 5993  if (WidenRecipe->getOpcode() == Instruction::Sub &&
 5994      Chain.RK != RecurKind::Sub) {
 5995    VPBuilder Builder(WidenRecipe);
 5996    Type *ElemTy = TypeInfo.inferScalarType(BinOp);
 5997    auto *Zero = Plan.getZero(ElemTy);
 5998    VPIRFlags Flags = WidenRecipe->getUnderlyingInstr()
 5999                          ? VPIRFlags(*WidenRecipe->getUnderlyingInstr())
 6000                          : VPIRFlags();
 6001    auto *NegRecipe = new VPWidenRecipe(Instruction::Sub, {Zero, BinOp}, Flags,
 6003    Builder.insert(NegRecipe);
 6004    BinOp = NegRecipe;
 6005  }
 6006
 6007  // FIXME: Do these transforms before invoking the cost-model.
 6008  BinOp = optimizeExtendsForPartialReduction(BinOp, TypeInfo);
 6009
 6010  // Check if WidenRecipe is the final result of the reduction. If so look
 6011  // through selects for predicated reductions.
 6012  VPValue *Cond = nullptr;
 6014      WidenRecipe,
 6015      m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
 6016  bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
 6017                       RdxPhi->getBackedgeValue() == ExitValue;
 6018  assert((!ExitValue || IsLastInChain) &&
 6019         "if we found ExitValue, it must match RdxPhi's backedge value");
 6020
 6021  Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
 6022  RecurKind RdxKind =
 6024  auto *PartialRed = new VPReductionRecipe(
 6025      RdxKind,
 6026      RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
 6027                                 : FastMathFlags(),
 6028      WidenRecipe->getUnderlyingInstr(), Accumulator, BinOp, Cond,
 6029      RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
 6030  PartialRed->insertBefore(WidenRecipe);
 6031
 6032  if (Cond)
 6033    ExitValue->replaceAllUsesWith(PartialRed);
 6034  WidenRecipe->replaceAllUsesWith(PartialRed);
 6035
 6036  // We only need to update the PHI node once, which is when we find the
 6037  // last reduction in the chain.
 6038  if (!IsLastInChain)
 6039    return;
 6040
 6041  // Scale the PHI and ReductionStartVector by the VFScaleFactor
 6042  assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
 6043  RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
 6044
 6045  auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
 6046  assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
 6047  auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
 6048  StartInst->setOperand(2, NewScaleFactor);
 6049
 6050  // If this is the last value in a sub-reduction chain, then update the PHI
 6051  // node to start at `0` and update the reduction-result to subtract from
 6052  // the PHI's start value.
 6053  if (Chain.RK != RecurKind::Sub)
 6054    return;
 6055
 6056  VPValue *OldStartValue = StartInst->getOperand(0);
 6057  StartInst->setOperand(0, StartInst->getOperand(1));
 6058
 6059  // Replace reduction_result by 'sub (startval, reductionresult)'.
 6061  assert(RdxResult && "Could not find reduction result");
 6062
 6063  VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
 6064  constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
 6065  VPInstruction *NewResult = Builder.createNaryOp(
 6066      SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
 6067      RdxPhi->getDebugLoc());
  // Rewire all users of the old result except the new subtract itself.
 6068  RdxResult->replaceUsesWithIf(
 6069      NewResult,
 6070      [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
 6071}
6072
6073/// Check if a partial reduction chain is supported by the target (i.e. does
 6074/// not have an invalid cost) for the given VF range. Clamps the range and
 6075/// returns true if profitable for any VF.
 6076static bool isValidPartialReduction(const VPPartialReductionChain &Chain,
 6077                                    Type *PhiType, VPCostContext &CostCtx,
 6078                                    VFRange &Range) {
  // Maps a (possibly null) cast recipe to its source type and extend kind.
 6079  auto GetExtInfo = [&CostCtx](VPWidenCastRecipe *Ext)
 6080      -> std::pair<Type *, TargetTransformInfo::PartialReductionExtendKind> {
 6081    if (!Ext)
 6082      return {nullptr, TargetTransformInfo::PR_None};
 6083    Type *ExtOpType = CostCtx.Types.inferScalarType(Ext->getOperand(0));
 6085        static_cast<Instruction::CastOps>(Ext->getOpcode()));
 6086    return {ExtOpType, ExtKind};
 6087  };
 6088  ExtendedReductionOperand ExtendedOp = Chain.ExtendedOp;
 6089  VPWidenCastRecipe *ExtendA = ExtendedOp.CastRecipes[0];
 6090  VPWidenCastRecipe *ExtendB = ExtendedOp.CastRecipes[1];
 6091
 6092  Type *ExtOpTypeA, *ExtOpTypeB;
 6094  std::tie(ExtOpTypeA, ExtKindA) = GetExtInfo(ExtendA);
 6095  std::tie(ExtOpTypeB, ExtKindB) = GetExtInfo(ExtendB);
 6096
 6097  // If ExtendB is nullptr but there's a separate BinOp, the second operand
 6098  // was a constant that can use the same extend kind as the first.
 6099  if (!ExtendB && ExtendedOp.BinOp &&
 6100      ExtendedOp.BinOp != Chain.ReductionBinOp) {
 6101    const APInt *Const = nullptr;
 6102    for (VPValue *Op : ExtendedOp.BinOp->operands()) {
 6103      if (match(Op, m_APInt(Const)))
 6104        break;
 6105    }
 6106    if (!Const || !canConstantBeExtended(Const, ExtOpTypeA, ExtKindA))
 6107      return false;
 6108    ExtOpTypeB = ExtOpTypeA;
 6109    ExtKindB = ExtKindA;
 6110  }
 6111
  // Only pass an inner binary opcode when it is distinct from the reduction.
 6112  std::optional<unsigned> BinOpc;
 6113  if (ExtendedOp.BinOp && ExtendedOp.BinOp != Chain.ReductionBinOp)
 6114    BinOpc = ExtendedOp.BinOp->getOpcode();
 6115
 6116  VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
 6118      [&](ElementCount VF) {
 6119        return CostCtx.TTI
 6121                WidenRecipe->getOpcode(), ExtOpTypeA, ExtOpTypeB, PhiType, VF,
 6122                ExtKindA, ExtKindB, BinOpc, CostCtx.CostKind,
 6123                PhiType->isFloatingPointTy()
 6124                    ? std::optional{WidenRecipe->getFastMathFlags()}
 6125                    : std::nullopt)
 6126            .isValid();
 6127      },
 6128      Range);
 6129}
6130
/// Maps the cast opcode of \p Cast to the corresponding
/// TTI::PartialReductionExtendKind.
6132getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
 6134}
6135
6136/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
 6137/// operand. This is an operand where the source of the value (e.g. a load) has
 6138/// been extended (sext, zext, or fpext) before it is used in the reduction.
 6139///
 6140/// Possible forms matched by this function:
 6141///  - UpdateR(PrevValue, ext(...))
 6142///  - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
 6143///  - UpdateR(PrevValue, BinOp(ext(...), Constant))
 6144///  - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
 6145///  - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
 6146///  - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
 6147///  - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
 6148///
 6149/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
 6150static std::optional<ExtendedReductionOperand>
 6151matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op) {
 6152  assert(is_contained(UpdateR->operands(), Op) &&
 6153         "Op should be operand of UpdateR");
 6154
 6155  std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
 6157    auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
 6158    VPValue *CastSource = CastRecipe->getOperand(0);
 6159    if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
 6160        match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
 6161      // Match: ext(mul(...))
 6162      // Record the outer extend kind and set `Op` to the mul. We can then match
 6163      // this as a binary operation. Note: We can optimize out the outer extend
 6164      // by widening the inner extends to match it. See
 6165      // optimizeExtendsForPartialReduction.
 6166      Op = CastSource;
 6167      OuterExtKind = getPartialReductionExtendKind(CastRecipe);
 6168    } else if (UpdateR->getOpcode() == Instruction::Add ||
 6169               UpdateR->getOpcode() == Instruction::FAdd) {
 6170      // Match: UpdateR(PrevValue, ext(...))
 6171      // TODO: Remove the add/fadd restriction (we should be able to handle this
 6172      // case for sub reductions too).
 6173      return ExtendedReductionOperand{UpdateR, {CastRecipe, nullptr}};
 6174    }
 6175  }
 6176
 6177  if (!Op->hasOneUse())
 6178    return std::nullopt;
 6179
 6180  // Handle neg(...) pattern (aka sub(0, ...)).
 6181  VPValue *NegatedOp = nullptr;
 6182  if (match(Op, m_Sub(m_ZeroInt(), m_VPValue(NegatedOp))))
 6183    Op = NegatedOp;
 6184
 6186  if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
 6187    return std::nullopt;
 6188
 6189  // The rest of the matching assumes `Op` is a (possibly extended/negated)
 6190  // binary operation.
 6191
 6192  VPValue *LHS = BinOp->getOperand(0);
 6193  VPValue *RHS = BinOp->getOperand(1);
 6194
 6195  // The LHS of the operation must always be an extend.
 6197    return std::nullopt;
 6198
 6199  auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
 6200
 6201  // The RHS of the operation can be an extend or a constant integer.
 6202  // The constant will be validated in isValidPartialReduction.
 6203  VPWidenCastRecipe *RHSCast = nullptr;
 6205    RHSCast = cast<VPWidenCastRecipe>(RHS);
 6206  else if (!isa<VPConstantInt>(RHS))
 6207    return std::nullopt;
 6208
 6209  // The outer extend kind must match the inner extends for folding.
 6210  for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
 6211    if (Cast && OuterExtKind &&
 6212        getPartialReductionExtendKind(Cast) != OuterExtKind)
 6213      return std::nullopt;
 6214
 6215  return ExtendedReductionOperand{BinOp, {LHSCast, RHSCast}};
 6216}
6217
6218/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6219/// and determines if the target can use a cheaper operation with a wider
6220/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6221/// of operations in the reduction.
6222static std::optional<SmallVector<VPPartialReductionChain>>
6223getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6224 VFRange &Range) {
6225 // Get the backedge value from the reduction PHI and find the
6226 // ComputeReductionResult that uses it (directly or through a select for
6227 // predicated reductions).
6228 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6229 if (!RdxResult)
6230 return std::nullopt;
6231 VPValue *ExitValue = RdxResult->getOperand(0);
 // Look through the tail-folding select, if present; on a match ExitValue is
 // rebound to the select's true operand (the actual reduction update).
6232 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6233
 // NOTE(review): inner line 6234 is missing from this extract — presumably
 // `SmallVector<VPPartialReductionChain> Chains;` given the push_back and
 // return below; verify against upstream.
6235 RecurKind RK = RedPhiR->getRecurrenceKind();
6236 Type *PhiType = CostCtx.Types.inferScalarType(RedPhiR);
6237 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6238
6239 // Work backwards from the ExitValue examining each reduction operation.
6240 VPValue *CurrentValue = ExitValue;
6241 while (CurrentValue != RedPhiR) {
6242 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6243 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6244 return std::nullopt;
6245
6246 VPValue *Op = UpdateR->getOperand(1);
6247 VPValue *PrevValue = UpdateR->getOperand(0);
6248
6249 // Find the extended operand. The other operand (PrevValue) is the next link
6250 // in the reduction chain.
6251 std::optional<ExtendedReductionOperand> ExtendedOp =
6252 matchExtendedReductionOperand(UpdateR, Op);
6253 if (!ExtendedOp) {
 // The extended operand may be in position 0 instead; retry swapped.
6254 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue);
6255 if (!ExtendedOp)
6256 return std::nullopt;
6257 std::swap(Op, PrevValue);
6258 }
6259
 // The scale factor is PHI-width / extend-source-width; it must divide
 // evenly, otherwise a partial reduction cannot be formed.
6260 Type *ExtSrcType = CostCtx.Types.inferScalarType(
6261 ExtendedOp->CastRecipes[0]->getOperand(0));
6262 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6263 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6264 return std::nullopt;
6265
6266 VPPartialReductionChain Chain(
6267 {UpdateR, *ExtendedOp,
6268 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize)), RK});
 // Any invalid link invalidates the entire chain for this PHI.
6269 if (!isValidPartialReduction(Chain, PhiType, CostCtx, Range))
6270 return std::nullopt;
6271
6272 Chains.push_back(Chain);
6273 CurrentValue = PrevValue;
6274 }
6275
6276 // The chains were collected by traversing backwards from the exit value.
6277 // Reverse the chains so they are in program order.
6278 std::reverse(Chains.begin(), Chains.end());
6279 return Chains;
6280}
6281} // namespace
6282
6284 VPCostContext &CostCtx,
6285 VFRange &Range) {
6286 // Find all possible valid partial reductions, grouping chains by their PHI.
6287 // This grouping allows invalidating the whole chain, if any link is not a
6288 // valid partial reduction.
6290 ChainsByPhi;
6291 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6292 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6293 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6294 if (!RedPhiR)
6295 continue;
6296
6297 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6298 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6299 }
6300
6301 if (ChainsByPhi.empty())
6302 return;
6303
6304 // Build set of partial reduction operations for extend user validation and
6305 // a map of reduction bin ops to their scale factors for scale validation.
6306 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6307 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6308 for (const auto &[_, Chains] : ChainsByPhi)
6309 for (const VPPartialReductionChain &Chain : Chains) {
6310 PartialReductionOps.insert(Chain.ExtendedOp.BinOp);
6311 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6312 }
6313
6314 // A partial reduction is invalid if any of its extends are used by
6315 // something that isn't another partial reduction. This is because the
6316 // extends are intended to be lowered along with the reduction itself.
6317 auto ExtendUsersValid = [&](VPWidenCastRecipe *Ext) {
6318 return !Ext || all_of(Ext->users(), [&](VPUser *U) {
6319 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6320 });
6321 };
6322
6323 // Validate chains: check that extends are only used by partial reductions,
6324 // and that reduction bin ops are only used by other partial reductions with
6325 // matching scale factors, are outside the loop region or the select
6326 // introduced by tail-folding. Otherwise we would create users of scaled
6327 // reductions where the types of the other operands don't match.
6328 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6329 for (const VPPartialReductionChain &Chain : Chains) {
6330 if (!all_of(Chain.ExtendedOp.CastRecipes, ExtendUsersValid)) {
6331 Chains.clear();
6332 break;
6333 }
6334 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6335 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6336 return PhiR == RedPhiR;
6337 auto *R = cast<VPSingleDefRecipe>(U);
6338 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6340 m_Specific(Chain.ReductionBinOp))) ||
6341 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6342 m_Specific(RedPhiR)));
6343 };
6344 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6345 Chains.clear();
6346 break;
6347 }
6348
6349 // Check if the compute-reduction-result is used by a sunk store.
6350 // TODO: Also form partial reductions in those cases.
6351 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6352 if (any_of(RdxResult->users(), [](VPUser *U) {
6353 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6354 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6355 })) {
6356 Chains.clear();
6357 break;
6358 }
6359 }
6360 }
6361 }
6362
6363 for (auto &[Phi, Chains] : ChainsByPhi)
6364 for (const VPPartialReductionChain &Chain : Chains)
6365 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6366}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1043
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:217
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1016
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1597
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3889
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4269
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4344
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4296
iterator end()
Definition VPlan.h:4306
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4304
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4357
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4316
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4318
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2794
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2830
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2820
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2836
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2816
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:273
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:300
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:200
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:218
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:236
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:256
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-sucessor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3298
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3831
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3921
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:4001
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3343
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2306
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2348
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2337
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4422
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1327
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1272
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1321
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2939
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2931
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2960
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3012
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2970
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1593
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3485
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4574
VPBasicBlock * getParent()
Definition VPlan.h:481
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3172
A recipe for handling reduction phis.
Definition VPlan.h:2700
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2747
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2740
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2758
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3063
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4457
const VPBlockBase * getEntry() const
Definition VPlan.h:4493
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4568
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4525
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4510
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4555
const VPBlockBase * getExiting() const
Definition VPlan.h:4505
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4518
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3217
bool isSingleScalar() const
Definition VPlan.h:3258
bool isPredicated() const
Definition VPlan.h:3260
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3282
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4073
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
operand_range operands()
Definition VPlanValue.h:364
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:340
unsigned getNumOperands() const
Definition VPlanValue.h:334
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
void addOperand(VPValue *Operand)
Definition VPlanValue.h:329
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1425
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
bool hasOneUse() const
Definition VPlanValue.h:166
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:196
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1428
unsigned getNumUsers() const
Definition VPlanValue.h:107
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1434
user_range users()
Definition VPlanValue.h:149
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2154
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3964
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1840
Instruction::CastOps getOpcode() const
Definition VPlan.h:1878
A recipe for handling GEP instructions.
Definition VPlan.h:2090
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2372
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2400
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2418
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2403
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2423
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2454
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2501
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2505
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2532
A recipe for widening vector intrinsics.
Definition VPlan.h:1892
A common base class for widening memory operations.
Definition VPlan.h:3528
A recipe for widened phis.
Definition VPlan.h:2590
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1784
unsigned getOpcode() const
Definition VPlan.h:1821
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4587
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4895
bool hasVF(ElementCount VF) const
Definition VPlan.h:4800
const DataLayout & getDataLayout() const
Definition VPlan.h:4782
LLVMContext & getContext() const
Definition VPlan.h:4778
VPBasicBlock * getEntry()
Definition VPlan.h:4679
bool hasScalableVF() const
Definition VPlan.h:4801
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4737
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4758
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4807
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4866
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4776
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4872
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4942
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4898
bool hasUF(unsigned UF) const
Definition VPlan.h:4818
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4727
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4766
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4843
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4869
void setVF(ElementCount VF)
Definition VPlan.h:4788
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4834
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1058
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4821
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4751
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4704
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4921
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4863
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4773
bool hasScalarVFOnly() const
Definition VPlan.h:4811
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4718
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4684
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4769
void setUF(unsigned UF)
Definition VPlan.h:4826
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4974
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1206
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4877
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2803
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
class_match< const SCEV > m_SCEV()
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:280
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:273
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1790
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
iterator_range< po_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_post_order_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order.
Definition VPlanCFG.h:266
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2682
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2638
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:207
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:247
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:255
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3661
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3619
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3746
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3702
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...