1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
45
46using namespace llvm;
47using namespace VPlanPatternMatch;
48using namespace SCEVPatternMatch;
49
51 VPlan &Plan, const TargetLibraryInfo &TLI) {
52
53 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
54 Plan.getVectorLoopRegion());
55 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
75 for (VPValue *Op : PhiR->operands())
76 NewRecipe->addOperand(Op);
77 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
78 assert(!isa<PHINode>(Inst) && "phis should be handled above");
79 // Create VPWidenMemoryRecipe for loads and stores.
80 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
81 NewRecipe = new VPWidenLoadRecipe(
82 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
83 false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, *VPI,
88 Ingredient.getDebugLoc());
89 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
90 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc());
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96
97 // The noalias.scope.decl intrinsic declares a noalias scope that
98 // is valid for a single iteration. Emitting it as a single-scalar
99 // replicate would incorrectly extend the scope across multiple
100 // original iterations packed into one vector iteration.
101 // FIXME: If we want to vectorize this loop, then we have to drop
102 // all the associated !alias.scope and !noalias.
103 if (VectorID == Intrinsic::experimental_noalias_scope_decl)
104 return false;
105
106 // These intrinsics are recognized by getVectorIntrinsicIDForCall
107 // but are not widenable. Emit them as replicate instead of widening.
108 if (VectorID == Intrinsic::assume ||
109 VectorID == Intrinsic::lifetime_end ||
110 VectorID == Intrinsic::lifetime_start ||
111 VectorID == Intrinsic::sideeffect ||
112 VectorID == Intrinsic::pseudoprobe) {
113 // If the operand of llvm.assume holds before vectorization, it will
114 // also hold per lane.
115 // llvm.pseudoprobe needs to be duplicated per lane for an accurate
116 // sample count.
117 const bool IsSingleScalar = VectorID != Intrinsic::assume &&
118 VectorID != Intrinsic::pseudoprobe;
119 NewRecipe = new VPReplicateRecipe(CI, Ingredient.operands(),
120 /*IsSingleScalar=*/IsSingleScalar,
121 /*Mask=*/nullptr, *VPI, *VPI,
122 Ingredient.getDebugLoc());
123 } else {
124 NewRecipe = new VPWidenIntrinsicRecipe(
125 *CI, VectorID, drop_end(Ingredient.operands()), CI->getType(),
126 VPIRFlags(*CI), *VPI, CI->getDebugLoc());
127 }
128 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
129 NewRecipe = new VPWidenCastRecipe(
130 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
131 VPIRFlags(*CI), VPIRMetadata(*CI));
132 } else {
133 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
134 *VPI, Ingredient.getDebugLoc());
135 }
136 } else {
138 "inductions must be created earlier");
139 continue;
140 }
141
142 NewRecipe->insertBefore(&Ingredient);
143 if (NewRecipe->getNumDefinedValues() == 1)
144 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
145 else
146 assert(NewRecipe->getNumDefinedValues() == 0 &&
147 "Only recpies with zero or one defined values expected");
148 Ingredient.eraseFromParent();
149 }
150 }
151 return true;
152}
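// Illustrative sketch only (the recipe names are real, the concrete values are
// invented): a scalar load modeled as a VPInstruction with an underlying
// LoadInst, roughly
//   EMIT ir<%l> = load ir<%gep>
// is rewritten above into a widening memory recipe along the lines of
//   WIDEN ir<%l> = load ir<%gep>
// whereas calls to non-widenable intrinsics such as llvm.assume or
// llvm.pseudoprobe become VPReplicateRecipes instead.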
153
154/// Helper for extra no-alias checks via known-safe recipe and SCEV.
155 class SinkStoreInfo {
156 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
157 VPReplicateRecipe &GroupLeader;
158 PredicatedScalarEvolution &PSE;
159 const Loop &L;
160 VPTypeAnalysis &TypeInfo;
161
162 // Return true if \p A and \p B are known not to alias for all VFs in the
163 // plan, checked via the distance between the accesses.
164 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
165 if (A->getOpcode() != Instruction::Store ||
166 B->getOpcode() != Instruction::Store)
167 return false;
168
169 VPValue *AddrA = A->getOperand(1);
170 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
171 VPValue *AddrB = B->getOperand(1);
172 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
173 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
174 return false;
175
176 const APInt *Distance;
177 ScalarEvolution &SE = *PSE.getSE();
178 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
179 return false;
180
181 const DataLayout &DL = SE.getDataLayout();
182 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
183 uint64_t SizeA = DL.getTypeStoreSize(TyA);
184 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
185 uint64_t SizeB = DL.getTypeStoreSize(TyB);
186
187 // Use the maximum store size to ensure no overlap from either direction.
188 // Currently only handles fixed sizes, as it is only used for
189 // replicating VPReplicateRecipes.
190 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
191
192 auto VFs = B->getParent()->getPlan()->vectorFactors();
194 if (MaxVF.isScalable())
195 return false;
196 return Distance->abs().uge(
197 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
198 }
199
200public:
201 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
202 VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
203 const Loop &L, VPTypeAnalysis &TypeInfo)
204 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
205 L(L), TypeInfo(TypeInfo) {}
206
207 /// Return true if \p R should be skipped during alias checking, either
208 /// because it's in the exclude set or because no-alias can be proven via
209 /// SCEV.
210 bool shouldSkip(VPRecipeBase &R) const {
211 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
212 return ExcludeRecipes.contains(&R) ||
213 (Store && isNoAliasViaDistance(Store, &GroupLeader));
214 }
215};
216
217/// Check if a memory operation doesn't alias with memory operations using
218/// scoped noalias metadata, in blocks in the single-successor chain between \p
219/// FirstBB and \p LastBB. If \p SinkInfo is std::nullopt, only recipes that may
220 /// write to memory are checked (for load hoisting). Otherwise recipes that
221 /// either read or write memory are checked, and SCEV is used to prove no-alias
222 /// between the group leader and other replicate recipes (for store sinking).
223static bool
225 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
226 std::optional<SinkStoreInfo> SinkInfo = {}) {
227 bool CheckReads = SinkInfo.has_value();
228 if (!MemLoc.AATags.Scope)
229 return false;
230
231 for (VPBasicBlock *VPBB :
233 for (VPRecipeBase &R : *VPBB) {
234 if (SinkInfo && SinkInfo->shouldSkip(R))
235 continue;
236
237 // Skip recipes that don't need checking.
238 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
239 continue;
240
242 if (!Loc)
243 // Conservatively assume aliasing for memory operations without
244 // location.
245 return false;
246
248 return false;
249 }
250 }
251 return true;
252}
253
254/// Collect either replicated Loads or Stores grouped by their address SCEV, in
255/// a deep-traversal of the vector loop region in \p Plan.
256template <unsigned Opcode>
259 VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
260 function_ref<bool(VPReplicateRecipe *)> FilterFn) {
261 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
262 "Only Load and Store opcodes supported");
263 constexpr bool IsLoad = (Opcode == Instruction::Load);
265 RecipesByAddress;
268 for (VPRecipeBase &R : *VPBB) {
269 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
270 if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
271 continue;
272
273 // For loads, operand 0 is address; for stores, operand 1 is address.
274 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
275 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
276 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
277 RecipesByAddress[AddrSCEV].push_back(RepR);
278 }
279 }
280 auto Groups = to_vector(RecipesByAddress.values());
281 VPDominatorTree VPDT(Plan);
282 for (auto &Group : Groups) {
283 // Sort mem ops by dominance order, with earliest (most dominating) first.
285 return VPDT.properlyDominates(A, B);
286 });
287 }
288 return Groups;
289}
290
291/// Return true if we do not know how to (mechanically) hoist or sink \p R out
292/// of a loop region.
294 // Assumes don't alias anything or throw; as long as they're guaranteed to
295 // execute, they're safe to hoist.
297 return false;
298
299 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
300 // memory location is not modified in the vector loop.
301 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
302 return true;
303
304 // Allocas cannot be hoisted.
305 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
306 return RepR && RepR->getOpcode() == Instruction::Alloca;
307}
308
309static bool sinkScalarOperands(VPlan &Plan) {
310 auto Iter = vp_depth_first_deep(Plan.getEntry());
311 bool ScalarVFOnly = Plan.hasScalarVFOnly();
312 bool Changed = false;
313
315 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
316 VPBasicBlock *SinkTo, VPValue *Op) {
317 auto *Candidate =
318 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
319 if (!Candidate)
320 return;
321
322 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
323 // for now.
324 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
325 return;
326
327 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
328 return;
329
330 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
331 if (!ScalarVFOnly && RepR->isSingleScalar())
332 return;
333
334 WorkList.insert({SinkTo, Candidate});
335 };
336
337 // First, collect the operands of all recipes in replicate blocks as seeds for
338 // sinking.
339 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
340 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
341 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
342 continue;
343 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
344 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
345 continue;
346 for (auto &Recipe : *VPBB)
347 for (VPValue *Op : Recipe.operands())
348 InsertIfValidSinkCandidate(VPBB, Op);
349 }
350
351 // Try to sink each replicate or scalar IV steps recipe in the worklist.
352 for (unsigned I = 0; I != WorkList.size(); ++I) {
353 VPBasicBlock *SinkTo;
354 VPSingleDefRecipe *SinkCandidate;
355 std::tie(SinkTo, SinkCandidate) = WorkList[I];
356
357 // All recipe users of SinkCandidate must be in the same block SinkTo or all
358 // users outside of SinkTo must only use the first lane of SinkCandidate. In
359 // the latter case, we need to duplicate SinkCandidate.
360 auto UsersOutsideSinkTo =
361 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
362 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
363 });
364 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
365 return !U->usesFirstLaneOnly(SinkCandidate);
366 }))
367 continue;
368 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
369
370 if (NeedsDuplicating) {
371 if (ScalarVFOnly)
372 continue;
373 VPSingleDefRecipe *Clone;
374 if (auto *SinkCandidateRepR =
375 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
376 // TODO: Handle converting to uniform recipes as separate transform,
377 // then cloning should be sufficient here.
378 Instruction *I = SinkCandidate->getUnderlyingInstr();
379 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
380 nullptr /*Mask*/, *SinkCandidateRepR,
381 *SinkCandidateRepR);
382 // TODO: add ".cloned" suffix to name of Clone's VPValue.
383 } else {
384 Clone = SinkCandidate->clone();
385 }
386
387 Clone->insertBefore(SinkCandidate);
388 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
389 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
390 });
391 }
392 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
393 for (VPValue *Op : SinkCandidate->operands())
394 InsertIfValidSinkCandidate(SinkTo, Op);
395 Changed = true;
396 }
397 return Changed;
398}
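// Hedged example of the effect (block and value names are invented): if
//   vector.body:    REPLICATE vp<%a> = add ir<%x>, ir<%y>
//   pred.store.if:  REPLICATE store vp<%a>, ir<%p>   ; masked replicate block
// and the add is only used inside pred.store.if, it is sunk into that block,
// so it only executes for lanes whose mask bit is set. Users outside the
// sink-to block that only need the first lane keep a single-scalar clone.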
399
400/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
401/// the mask.
403 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
404 if (!EntryBB || EntryBB->size() != 1 ||
405 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
406 return nullptr;
407
408 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
409}
410
411/// If \p R is a triangle region, return the 'then' block of the triangle.
413 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
414 if (EntryBB->getNumSuccessors() != 2)
415 return nullptr;
416
417 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
418 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
419 if (!Succ0 || !Succ1)
420 return nullptr;
421
422 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
423 return nullptr;
424 if (Succ0->getSingleSuccessor() == Succ1)
425 return Succ0;
426 if (Succ1->getSingleSuccessor() == Succ0)
427 return Succ1;
428 return nullptr;
429}
430
431// Merge replicate regions in their successor region, if a replicate region
432// is connected to a successor replicate region with the same predicate by a
433// single, empty VPBasicBlock.
435 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
436
437 // Collect replicate regions followed by an empty block, followed by another
438 // replicate region with a matching mask, to process up front. This avoids
439 // iterator invalidation issues while merging regions.
442 vp_depth_first_deep(Plan.getEntry()))) {
443 if (!Region1->isReplicator())
444 continue;
445 auto *MiddleBasicBlock =
446 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
447 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
448 continue;
449
450 auto *Region2 =
451 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
452 if (!Region2 || !Region2->isReplicator())
453 continue;
454
455 VPValue *Mask1 = getPredicatedMask(Region1);
456 VPValue *Mask2 = getPredicatedMask(Region2);
457 if (!Mask1 || Mask1 != Mask2)
458 continue;
459
460 assert(Mask1 && Mask2 && "both regions must have conditions");
461 WorkList.push_back(Region1);
462 }
463
464 // Move recipes from Region1 to its successor region, if both are triangles.
465 for (VPRegionBlock *Region1 : WorkList) {
466 if (TransformedRegions.contains(Region1))
467 continue;
468 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
469 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
470
471 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
472 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
473 if (!Then1 || !Then2)
474 continue;
475
476 // Note: No fusion-preventing memory dependencies are expected in either
477 // region. Such dependencies should be rejected during earlier dependence
478 // checks, which guarantee accesses can be re-ordered for vectorization.
479 //
480 // Move recipes to the successor region.
481 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
482 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
483
484 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
485 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
486
487 // Move VPPredInstPHIRecipes from the merge block to the successor region's
488 // merge block. Update all users inside the successor region to use the
489 // original values.
490 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
491 VPValue *PredInst1 =
492 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
493 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
494 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
495 return cast<VPRecipeBase>(&U)->getParent() == Then2;
496 });
497
498 // Remove phi recipes that are unused after merging the regions.
499 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
500 Phi1ToMove.eraseFromParent();
501 continue;
502 }
503 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
504 }
505
506 // Remove the dead recipes in Region1's entry block.
507 for (VPRecipeBase &R :
508 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
509 R.eraseFromParent();
510
511 // Finally, remove the first region.
512 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
513 VPBlockUtils::disconnectBlocks(Pred, Region1);
514 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
515 }
516 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
517 TransformedRegions.insert(Region1);
518 }
519
520 return !TransformedRegions.empty();
521}
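// Schematic of the merge (not printed from a real plan):
//
//   region1: entry(BranchOnMask %m) -> then1 -> merge1
//   <empty block>
//   region2: entry(BranchOnMask %m) -> then2 -> merge2
//
// Because both regions are guarded by the same mask %m, the recipes from
// then1 are moved into then2, the VPPredInstPHIs from merge1 into merge2, and
// region1 is disconnected from the CFG.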
522
524 VPlan &Plan) {
525 Instruction *Instr = PredRecipe->getUnderlyingInstr();
526 // Build the triangular if-then region.
527 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
528 assert(Instr->getParent() && "Predicated instruction not in any basic block");
529 auto *BlockInMask = PredRecipe->getMask();
530 auto *MaskDef = BlockInMask->getDefiningRecipe();
531 auto *BOMRecipe = new VPBranchOnMaskRecipe(
532 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
533 auto *Entry =
534 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
535
536 // Replace predicated replicate recipe with a replicate recipe without a
537 // mask but in the replicate region.
538 auto *RecipeWithoutMask = new VPReplicateRecipe(
539 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
540 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
541 PredRecipe->getDebugLoc());
542 auto *Pred =
543 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
544
545 VPPredInstPHIRecipe *PHIRecipe = nullptr;
546 if (PredRecipe->getNumUsers() != 0) {
547 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
548 RecipeWithoutMask->getDebugLoc());
549 PredRecipe->replaceAllUsesWith(PHIRecipe);
550 PHIRecipe->setOperand(0, RecipeWithoutMask);
551 }
552 PredRecipe->eraseFromParent();
553 auto *Exiting =
554 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
556 Plan.createReplicateRegion(Entry, Exiting, RegionName);
557
558 // Note: first set Entry as region entry and then connect successors starting
559 // from it in order, to propagate the "parent" of each VPBasicBlock.
560 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
561 VPBlockUtils::connectBlocks(Pred, Exiting);
562
563 return Region;
564}
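// The resulting replicate region has the following shape (schematic; the
// block names follow the "pred.<opcode>" scheme used above):
//
//   pred.<op>.entry:    BranchOnMask %mask
//   pred.<op>.if:       unmasked VPReplicateRecipe of the original instruction
//   pred.<op>.continue: VPPredInstPHI of the result, if the result has users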
565
566static void addReplicateRegions(VPlan &Plan) {
569 vp_depth_first_deep(Plan.getEntry()))) {
570 for (VPRecipeBase &R : *VPBB)
571 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
572 if (RepR->isPredicated())
573 WorkList.push_back(RepR);
574 }
575 }
576
577 unsigned BBNum = 0;
578 for (VPReplicateRecipe *RepR : WorkList) {
579 VPBasicBlock *CurrentBlock = RepR->getParent();
580 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
581
582 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
583 SplitBlock->setName(
584 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
585 // Record predicated instructions for above packing optimizations.
587 Region->setParent(CurrentBlock->getParent());
589
590 VPRegionBlock *ParentRegion = Region->getParent();
591 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
592 ParentRegion->setExiting(SplitBlock);
593 }
594}
595
599 vp_depth_first_deep(Plan.getEntry()))) {
600 // Don't fold the blocks in the skeleton of the Plan into their single
601 // predecessors for now.
602 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
603 if (!VPBB->getParent())
604 continue;
605 auto *PredVPBB =
606 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
607 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
608 isa<VPIRBasicBlock>(PredVPBB))
609 continue;
610 WorkList.push_back(VPBB);
611 }
612
613 for (VPBasicBlock *VPBB : WorkList) {
614 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
615 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
616 R.moveBefore(*PredVPBB, PredVPBB->end());
617 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
618 auto *ParentRegion = VPBB->getParent();
619 if (ParentRegion && ParentRegion->getExiting() == VPBB)
620 ParentRegion->setExiting(PredVPBB);
621 VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
622 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
623 }
624 return !WorkList.empty();
625}
626
627 void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
628 // Convert masked VPReplicateRecipes to if-then region blocks.
629 addReplicateRegions(Plan);
630
631 bool ShouldSimplify = true;
632 while (ShouldSimplify) {
633 ShouldSimplify = sinkScalarOperands(Plan);
634 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
635 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
636 }
637}
638
639/// Remove redundant casts of inductions.
640///
641/// Such redundant casts are casts of induction variables that can be ignored,
642/// because we already proved that the casted phi is equal to the uncasted phi
643/// in the vectorized loop. There is no need to vectorize the cast - the same
644/// value can be used for both the phi and casts in the vector loop.
646 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
647 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
648 if (!IV || IV->getTruncInst())
649 continue;
650
651 // A sequence of IR Casts has potentially been recorded for IV, which
652 // *must be bypassed* when the IV is vectorized, because the vectorized IV
653 // will produce the desired casted value. This sequence forms a def-use
654 // chain and is provided in reverse order, ending with the cast that uses
655 // the IV phi. Search for the recipe of the last cast in the chain and
656 // replace it with the original IV. Note that only the final cast is
657 // expected to have users outside the cast-chain and the dead casts left
658 // over will be cleaned up later.
659 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
660 VPValue *FindMyCast = IV;
661 for (Instruction *IRCast : reverse(Casts)) {
662 VPSingleDefRecipe *FoundUserCast = nullptr;
663 for (auto *U : FindMyCast->users()) {
664 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
665 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
666 FoundUserCast = UserCast;
667 break;
668 }
669 }
670 FindMyCast = FoundUserCast;
671 }
672 FindMyCast->replaceAllUsesWith(IV);
673 }
674}
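// Hedged example of a recorded cast chain (the IR values are invented): for
//   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
//   %iv.ext = zext i32 %iv to i64   ; proven equal to the widened %iv
// the recipe producing %iv.ext is located by walking the recorded casts, and
// all of its uses are rewired to the widened induction recipe for %iv; the
// now-dead cast recipes are removed by later dead-recipe cleanup.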
675
676/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
677/// recipe, if it exists.
679 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
680 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
681 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
682 for (VPUser *U : CanonicalIV->users()) {
683 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
684 if (WidenNewIV)
685 break;
686 }
687
688 if (!WidenNewIV)
689 return;
690
691 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
692 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
693 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
694
695 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
696 continue;
697
698 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
699 // everything WidenNewIV's users need. That is, WidenOriginalIV will
700 // generate a vector phi or all users of WidenNewIV demand the first lane
701 // only.
702 if (Plan.hasScalarVFOnly() ||
703 !vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
704 vputils::onlyFirstLaneUsed(WidenNewIV)) {
705 // We are replacing a wide canonical iv with a suitable wide induction.
706 // This is used to compute the header mask, hence all lanes will be used and
707 // we need to drop wrap flags that only apply to lanes guaranteed to execute
708 // in the original scalar loop.
709 WidenOriginalIV->dropPoisonGeneratingFlags();
710 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
711 WidenNewIV->eraseFromParent();
712 return;
713 }
714 }
715}
716
717/// Returns true if \p R is dead and can be removed.
718static bool isDeadRecipe(VPRecipeBase &R) {
719 // Do remove conditional assume instructions as their conditions may be
720 // flattened.
721 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
722 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
724 if (IsConditionalAssume)
725 return true;
726
727 if (R.mayHaveSideEffects())
728 return false;
729
730 // Recipe is dead if no user keeps the recipe alive.
731 return all_of(R.definedValues(),
732 [](VPValue *V) { return V->getNumUsers() == 0; });
733}
734
737 Plan.getEntry());
739 // The recipes in the block are processed in reverse order, to catch chains
740 // of dead recipes.
741 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
742 if (isDeadRecipe(R)) {
743 R.eraseFromParent();
744 continue;
745 }
746
747 // Check if R is a dead VPPhi <-> update cycle and remove it.
748 VPValue *Start, *Incoming;
749 if (!match(&R, m_VPPhi(m_VPValue(Start), m_VPValue(Incoming))))
750 continue;
751 auto *PhiR = cast<VPPhi>(&R);
752 VPUser *PhiUser = PhiR->getSingleUser();
753 if (!PhiUser)
754 continue;
755 if (PhiUser != Incoming->getDefiningRecipe() ||
756 Incoming->getNumUsers() != 1)
757 continue;
758 PhiR->replaceAllUsesWith(Start);
759 PhiR->eraseFromParent();
760 Incoming->getDefiningRecipe()->eraseFromParent();
761 }
762 }
763}
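// The dead phi <-> update cycle removed above looks like (sketch, invented
// names):
//   vp<%p>      = phi [ vp<%start>, vp<%p.next> ]
//   vp<%p.next> = add vp<%p>, ...    ; sole user of vp<%p>
// where vp<%p.next> itself is only used by the phi. Remaining uses of the phi
// are rewired to vp<%start> and both recipes are erased.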
764
767 Instruction::BinaryOps InductionOpcode,
768 FPMathOperator *FPBinOp, Instruction *TruncI,
769 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
770 VPBuilder &Builder) {
771 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
772 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
773 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
774 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
775 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
776
777 // Truncate base induction if needed.
778 VPTypeAnalysis TypeInfo(Plan);
779 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
780 if (TruncI) {
781 Type *TruncTy = TruncI->getType();
782 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
783 "Not truncating.");
784 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
785 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
786 ResultTy = TruncTy;
787 }
788
789 // Truncate step if needed.
790 Type *StepTy = TypeInfo.inferScalarType(Step);
791 if (ResultTy != StepTy) {
792 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
793 "Not truncating.");
794 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
795 auto *VecPreheader =
797 VPBuilder::InsertPointGuard Guard(Builder);
798 Builder.setInsertPoint(VecPreheader);
799 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
800 }
801 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
802 &Plan.getVF(), DL);
803}
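// In scalar terms the recipes built here compute roughly (hedged sketch):
//   offset.idx = Start + CanonicalIV * Step        ; the derived IV
//   steps[l]   = offset.idx + l * Step             ; per lane l, 0 <= l < VF
// with the derived IV and the step truncated to the requested result type
// when necessary.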
804
807 for (unsigned I = 0; I != Users.size(); ++I) {
808 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
809 if (isa<VPHeaderPHIRecipe>(Cur))
810 continue;
811 for (VPValue *V : Cur->definedValues())
812 Users.insert_range(V->users());
813 }
814 return Users.takeVector();
815}
816
817/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
818/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
819/// generates scalar values.
820static VPValue *
822 VPlan &Plan, VPBuilder &Builder) {
823 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
824 VPIRValue *StartV = Plan.getZero(ID.getStep()->getType());
825 VPValue *StepV = PtrIV->getOperand(1);
827 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
828 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
829
830 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
831 PtrIV->getDebugLoc(), "next.gep");
832}
833
834/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
835/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
836/// VPWidenPointerInductionRecipe will generate vectors only. If some users
837 /// require vectors while others require scalars, the scalar uses need to
838 /// extract the scalars from the generated vectors (note that this is different
839 /// from how int/fp inductions are handled). Legalize extract-from-ends using
840 /// uniform VPReplicateRecipe of wide inductions to use regular
841 /// VPReplicateRecipe, so the correct end value is available. Also optimize
842 /// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
843 /// providing them scalar steps built on the canonical scalar IV and updating
844 /// the original IV's users. This is an optional optimization to reduce the
845 /// need for vector extracts.
848 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
849 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
850 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
851 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
852 if (!PhiR)
853 continue;
854
855 // Try to narrow wide and replicating recipes to uniform recipes, based on
856 // VPlan analysis.
857 // TODO: Apply to all recipes in the future, to replace legacy uniformity
858 // analysis.
859 auto Users = collectUsersRecursively(PhiR);
860 for (VPUser *U : reverse(Users)) {
861 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
862 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
863 // Skip recipes that shouldn't be narrowed.
864 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
865 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
866 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
867 continue;
868
869 // Skip recipes for which lanes other than the first may be used.
871 continue;
872
873 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
874 Def->operands(), /*IsUniform*/ true,
875 /*Mask*/ nullptr, /*Flags*/ *Def);
876 Clone->insertAfter(Def);
877 Def->replaceAllUsesWith(Clone);
878 }
879
880 // Replace wide pointer inductions which have only their scalars used by
881 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
882 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
883 if (!Plan.hasScalarVFOnly() &&
884 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
885 continue;
886
887 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
888 PtrIV->replaceAllUsesWith(PtrAdd);
889 continue;
890 }
891
892 // Replace widened induction with scalar steps for users that only use
893 // scalars.
894 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
895 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
896 return U->usesScalars(WideIV);
897 }))
898 continue;
899
900 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
902 Plan, ID.getKind(), ID.getInductionOpcode(),
903 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
904 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
905 WideIV->getDebugLoc(), Builder);
906
907 // Update scalar users of IV to use Step instead.
908 if (!HasOnlyVectorVFs) {
909 assert(!Plan.hasScalableVF() &&
910 "plans containing a scalar VF cannot also include scalable VFs");
911 WideIV->replaceAllUsesWith(Steps);
912 } else {
913 bool HasScalableVF = Plan.hasScalableVF();
914 WideIV->replaceUsesWithIf(Steps,
915 [WideIV, HasScalableVF](VPUser &U, unsigned) {
916 if (HasScalableVF)
917 return U.usesFirstLaneOnly(WideIV);
918 return U.usesScalars(WideIV);
919 });
920 }
921 }
922}
923
924/// Check if \p VPV is an untruncated wide induction, either before or after the
925/// increment. If so return the header IV (before the increment), otherwise
926/// return null.
929 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
930 if (WideIV) {
931 // VPV itself is a wide induction, separately compute the end value for exit
932 // users if it is not a truncated IV.
933 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
934 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
935 }
936
937 // Check if VPV is an optimizable induction increment.
938 VPRecipeBase *Def = VPV->getDefiningRecipe();
939 if (!Def || Def->getNumOperands() != 2)
940 return nullptr;
941 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
942 if (!WideIV)
943 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
944 if (!WideIV)
945 return nullptr;
946
947 auto IsWideIVInc = [&]() {
948 auto &ID = WideIV->getInductionDescriptor();
949
950 // Check if VPV increments the induction by the induction step.
951 VPValue *IVStep = WideIV->getStepValue();
952 switch (ID.getInductionOpcode()) {
953 case Instruction::Add:
954 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
955 case Instruction::FAdd:
956 return match(VPV, m_c_FAdd(m_Specific(WideIV), m_Specific(IVStep)));
957 case Instruction::FSub:
958 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
959 m_Specific(IVStep)));
960 case Instruction::Sub: {
961 // IVStep will be the negated step of the subtraction. Check if Step == -1
962 // * IVStep.
963 VPValue *Step;
964 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
965 return false;
966 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
967 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
968 ScalarEvolution &SE = *PSE.getSE();
969 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
970 !isa<SCEVCouldNotCompute>(StepSCEV) &&
971 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
972 }
973 default:
974 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
975 match(VPV, m_GetElementPtr(m_Specific(WideIV),
976 m_Specific(WideIV->getStepValue())));
977 }
978 llvm_unreachable("should have been covered by switch above");
979 };
980 return IsWideIVInc() ? WideIV : nullptr;
981}
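// The two shapes accepted above, sketched with invented names:
//   vp<%wide.iv> = WIDEN-INDUCTION %start, %step   ; returned as-is
//   vp<%iv.next> = add vp<%wide.iv>, %step         ; increment: the header phi
//                                                  ; vp<%wide.iv> is returned
// Sub, FAdd/FSub and pointer (GEP) increments are matched analogously, as the
// switch above shows.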
982
983/// Attempts to optimize the induction variable exit values for users in the
984/// early exit block.
986 VPTypeAnalysis &TypeInfo,
987 VPBlockBase *PredVPBB,
988 VPValue *Op,
990 VPValue *Incoming, *Mask;
993 return nullptr;
994
995 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
996 if (!WideIV)
997 return nullptr;
998
999 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1000 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1001 return nullptr;
1002
1003 // Calculate the final index.
1004 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1005 auto *CanonicalIV = LoopRegion->getCanonicalIV();
1006 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
1007 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
1008
1009 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
1010 VPValue *FirstActiveLane =
1011 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
1012 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
1013 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
1014 FirstActiveLaneType, DL);
1015 VPValue *EndValue = B.createAdd(CanonicalIV, FirstActiveLane, DL);
1016
1017 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1018 // changed it means the exit is using the incremented value, so we need to
1019 // add the step.
1020 if (Incoming != WideIV) {
1021 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
1022 EndValue = B.createAdd(EndValue, One, DL);
1023 }
1024
1025 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1026 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1027 VPIRValue *Start = WideIV->getStartValue();
1028 VPValue *Step = WideIV->getStepValue();
1029 EndValue = B.createDerivedIV(
1030 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1031 Start, EndValue, Step);
1032 }
1033
1034 return EndValue;
1035}
1036
1037/// Compute the end value for \p WideIV, unless it is truncated. Creates a
1038/// VPDerivedIVRecipe for non-canonical inductions.
1040 VPBuilder &VectorPHBuilder,
1041 VPTypeAnalysis &TypeInfo,
1042 VPValue *VectorTC) {
1043 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
1044 // Truncated wide inductions resume from the last lane of their vector value
1045 /// in the last vector iteration, which is handled elsewhere.
1046 if (WideIntOrFp && WideIntOrFp->getTruncInst())
1047 return nullptr;
1048
1049 VPIRValue *Start = WideIV->getStartValue();
1050 VPValue *Step = WideIV->getStepValue();
1051 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
1052 VPValue *EndValue = VectorTC;
1053 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
1054 EndValue = VectorPHBuilder.createDerivedIV(
1055 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1056 Start, VectorTC, Step);
1057 }
1058
1059 // EndValue is derived from the vector trip count (which has the same type as
1060 // the widest induction) and thus may be wider than the induction here.
1061 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
1062 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
1063 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
1064 ScalarTypeOfWideIV,
1065 WideIV->getDebugLoc());
1066 }
1067
1068 return EndValue;
1069}
1070
1071/// Attempts to optimize the induction variable exit values for users in the
1072/// exit block coming from the latch in the original scalar loop.
1074 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1077 VPWidenInductionRecipe *WideIV = nullptr;
1079 WideIV = getOptimizableIVOf(Incoming, PSE);
1080
1081 if (!WideIV)
1082 return nullptr;
1083
1084 VPValue *EndValue = EndValues.lookup(WideIV);
1085 assert(EndValue && "Must have computed the end value up front");
1086
1087 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1088 // changed it means the exit is using the incremented value, so we don't
1089 // need to subtract the step.
1090 if (Incoming != WideIV)
1091 return EndValue;
1092
1093 // Otherwise, subtract the step from the EndValue.
1094 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1095 VPValue *Step = WideIV->getStepValue();
1096 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1097 if (ScalarTy->isIntegerTy())
1098 return B.createSub(EndValue, Step, DebugLoc::getUnknown(), "ind.escape");
1099 if (ScalarTy->isPointerTy()) {
1100 Type *StepTy = TypeInfo.inferScalarType(Step);
1101 auto *Zero = Plan.getZero(StepTy);
1102 return B.createPtrAdd(EndValue, B.createSub(Zero, Step),
1103 DebugLoc::getUnknown(), "ind.escape");
1104 }
1105 if (ScalarTy->isFloatingPointTy()) {
1106 const auto &ID = WideIV->getInductionDescriptor();
1107 return B.createNaryOp(
1108 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1109 ? Instruction::FSub
1110 : Instruction::FAdd,
1111 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1112 }
1113 llvm_unreachable("all possible induction types must be handled");
1114 return nullptr;
1115}
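// Hedged sketch of the latch-exit rewrite for a pre-incremented integer IV:
//   %ind.escape = EndValue - Step
// where EndValue was computed up front from the (vector) trip count. Pointer
// inductions use a ptradd with a negated step and FP inductions flip
// FAdd <-> FSub, as handled above.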
1116
1118 VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail) {
1119 // Compute end values for all inductions.
1120 VPTypeAnalysis TypeInfo(Plan);
1121 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1122 auto *VectorPH = cast<VPBasicBlock>(VectorRegion->getSinglePredecessor());
1123 VPBuilder VectorPHBuilder(VectorPH, VectorPH->begin());
1125 VPValue *ResumeTC =
1126 FoldTail ? Plan.getTripCount() : &Plan.getVectorTripCount();
1127 for (auto &Phi : VectorRegion->getEntryBasicBlock()->phis()) {
1128 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(&Phi);
1129 if (!WideIV)
1130 continue;
1132 WideIV, VectorPHBuilder, TypeInfo, ResumeTC))
1133 EndValues[WideIV] = EndValue;
1134 }
1135
1136 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
1137 for (VPRecipeBase &R : make_early_inc_range(*MiddleVPBB)) {
1138 VPValue *Op;
1139 if (!match(&R, m_ExitingIVValue(m_VPValue(Op))))
1140 continue;
1141 auto *WideIV = cast<VPWidenInductionRecipe>(Op);
1142 if (VPValue *EndValue = EndValues.lookup(WideIV)) {
1143 R.getVPSingleValue()->replaceAllUsesWith(EndValue);
1144 R.eraseFromParent();
1145 }
1146 }
1147
1148 // Then, optimize exit block users.
1149 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1150 for (VPRecipeBase &R : ExitVPBB->phis()) {
1151 auto *ExitIRI = cast<VPIRPhi>(&R);
1152
1153 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1154 VPValue *Escape = nullptr;
1155 if (PredVPBB == MiddleVPBB)
1156 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1157 ExitIRI->getOperand(Idx),
1158 EndValues, PSE);
1159 else
1161 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1162 if (Escape)
1163 ExitIRI->setOperand(Idx, Escape);
1164 }
1165 }
1166 }
1167}
1168
1169 /// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1170/// them with already existing recipes expanding the same SCEV expression.
1171 static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1172 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1173
1174 for (VPRecipeBase &R :
1175 make_early_inc_range(*Plan.getEntry())) {
1176 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1177 if (!ExpR)
1178 continue;
1179
1180 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1181 if (Inserted)
1182 continue;
1183 ExpR->replaceAllUsesWith(V->second);
1184 ExpR->eraseFromParent();
1185 }
1186}
1187
1189 SmallVector<VPValue *> WorkList;
1191 WorkList.push_back(V);
1192
1193 while (!WorkList.empty()) {
1194 VPValue *Cur = WorkList.pop_back_val();
1195 if (!Seen.insert(Cur).second)
1196 continue;
1197 VPRecipeBase *R = Cur->getDefiningRecipe();
1198 if (!R)
1199 continue;
1200 if (!isDeadRecipe(*R))
1201 continue;
1202 append_range(WorkList, R->operands());
1203 R->eraseFromParent();
1204 }
1205}
1206
1207/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1208/// Returns an optional pair, where the first element indicates whether it is
1209/// an intrinsic ID.
1210static std::optional<std::pair<bool, unsigned>>
1212 return TypeSwitch<const VPSingleDefRecipe *,
1213 std::optional<std::pair<bool, unsigned>>>(R)
1216 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1217 .Case([](const VPWidenIntrinsicRecipe *I) {
1218 return std::make_pair(true, I->getVectorIntrinsicID());
1219 })
1220 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1221 // For recipes that do not directly map to LLVM IR instructions,
1222 // assign opcodes after the last VPInstruction opcode (which is also
1223 // after the last IR Instruction opcode), based on the VPRecipeID.
1224 return std::make_pair(false,
1225 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1226 })
1227 .Default([](auto *) { return std::nullopt; });
1228}
1229
1230/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1231/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1232/// Operands are foldable live-ins.
1234 ArrayRef<VPValue *> Operands,
1235 const DataLayout &DL,
1236 VPTypeAnalysis &TypeInfo) {
1237 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1238 if (!OpcodeOrIID)
1239 return nullptr;
1240
1241 SmallVector<Value *, 4> Ops;
1242 for (VPValue *Op : Operands) {
1243 if (!match(Op, m_LiveIn()))
1244 return nullptr;
1245 Value *V = Op->getUnderlyingValue();
1246 if (!V)
1247 return nullptr;
1248 Ops.push_back(V);
1249 }
1250
1251 auto FoldToIRValue = [&]() -> Value * {
1252 InstSimplifyFolder Folder(DL);
1253 if (OpcodeOrIID->first) {
1254 if (R.getNumOperands() != 2)
1255 return nullptr;
1256 unsigned ID = OpcodeOrIID->second;
1257 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1258 TypeInfo.inferScalarType(&R));
1259 }
1260 unsigned Opcode = OpcodeOrIID->second;
1261 if (Instruction::isBinaryOp(Opcode))
1262 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1263 Ops[0], Ops[1]);
1264 if (Instruction::isCast(Opcode))
1265 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1266 TypeInfo.inferScalarType(R.getVPSingleValue()));
1267 switch (Opcode) {
1269 return Folder.FoldSelect(Ops[0], Ops[1],
1271 case VPInstruction::Not:
1272 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1274 case Instruction::Select:
1275 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1276 case Instruction::ICmp:
1277 case Instruction::FCmp:
1278 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1279 Ops[1]);
1280 case Instruction::GetElementPtr: {
1281 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1282 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1283 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1284 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1285 }
1288 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1289 Ops[0], Ops[1],
1290 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1291 // An extract of a live-in is an extract of a broadcast, so return the
1292 // broadcasted element.
1293 case Instruction::ExtractElement:
1294 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1295 return Ops[0];
1296 }
1297 return nullptr;
1298 };
1299
1300 if (Value *V = FoldToIRValue())
1301 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1302 return nullptr;
1303}
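// Hedged usage sketch: when every operand of a recipe is a live-in IR value,
// e.g. an integer add of the live-ins ir<2> and ir<3>, FoldToIRValue() yields
// the IR constant 5 via InstSimplifyFolder, and the result is registered as a
// live-in of the plan so users of the recipe can be rewired to it.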
1304
1305/// Try to simplify VPSingleDefRecipe \p Def.
1307 VPlan *Plan = Def->getParent()->getPlan();
1308
1309 // Simplification of live-in IR values for SingleDef recipes using
1310 // InstSimplifyFolder.
1311 const DataLayout &DL = Plan->getDataLayout();
1312 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1313 return Def->replaceAllUsesWith(V);
1314
1315 // Fold PredPHI LiveIn -> LiveIn.
1316 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1317 VPValue *Op = PredPHI->getOperand(0);
1318 if (isa<VPIRValue>(Op))
1319 PredPHI->replaceAllUsesWith(Op);
1320 }
1321
1322 VPBuilder Builder(Def);
1323
1324 // Avoid replacing VPInstructions that have underlying values with new
1325 // VPInstructions, as doing so would fail to create widen/replicate recipes
1326 // from the new VPInstructions (which lack an underlying value) and would
1327 // miss out on some transformations that only apply to widened/replicated
1328 // recipes later.
1329 // TODO: We should also not replace non-VPInstructions like VPWidenRecipe with
1330 // VPInstructions without underlying values, as those will get skipped during
1331 // cost computation.
1332 bool CanCreateNewRecipe =
1333 !isa<VPInstruction>(Def) || !Def->getUnderlyingValue();
1334
1335 VPValue *A;
1336 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1337 Type *TruncTy = TypeInfo.inferScalarType(Def);
1338 Type *ATy = TypeInfo.inferScalarType(A);
1339 if (TruncTy == ATy) {
1340 Def->replaceAllUsesWith(A);
1341 } else {
1342 // Don't replace a non-widened cast recipe with a widened cast.
1343 if (!isa<VPWidenCastRecipe>(Def))
1344 return;
1345 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1346
1347 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1348 ? Instruction::SExt
1349 : Instruction::ZExt;
1350 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1351 TruncTy);
1352 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1353 // UnderlyingExt has distinct return type, used to retain legacy cost.
1354 Ext->setUnderlyingValue(UnderlyingExt);
1355 }
1356 Def->replaceAllUsesWith(Ext);
1357 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1358 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1359 Def->replaceAllUsesWith(Trunc);
1360 }
1361 }
1362#ifndef NDEBUG
1363 // Verify that the cached type info for both A and its users is still
1364 // accurate by comparing it to freshly computed types.
1365 VPTypeAnalysis TypeInfo2(*Plan);
1366 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1367 for (VPUser *U : A->users()) {
1368 auto *R = cast<VPRecipeBase>(U);
1369 for (VPValue *VPV : R->definedValues())
1370 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1371 }
1372#endif
1373 }
1374
1375 // Simplify (X && Y) | (X && !Y) -> X.
1376 // TODO: Split up into simpler, modular combines: (X && Y) | (X && Z) into X
1377 // && (Y | Z) and (X | !X) into true. This requires queuing newly created
1378 // recipes to be visited during simplification.
1379 VPValue *X, *Y, *Z;
1380 if (match(Def,
1383 Def->replaceAllUsesWith(X);
1384 Def->eraseFromParent();
1385 return;
1386 }
1387
1388 // x | AllOnes -> AllOnes
1389 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1390 return Def->replaceAllUsesWith(
1391 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1392
1393 // x | 0 -> x
1394 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1395 return Def->replaceAllUsesWith(X);
1396
1397 // x | !x -> AllOnes
1398 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_Not(m_Deferred(X)))))
1399 return Def->replaceAllUsesWith(
1400 Plan->getAllOnesValue(TypeInfo.inferScalarType(Def)));
1401
1402 // x & 0 -> 0
1403 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1404 return Def->replaceAllUsesWith(
1405 Plan->getZero(TypeInfo.inferScalarType(Def)));
1406
1407 // x & AllOnes -> x
1408 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_AllOnes())))
1409 return Def->replaceAllUsesWith(X);
1410
1411 // x && false -> false
1412 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_False())))
1413 return Def->replaceAllUsesWith(Plan->getFalse());
1414
1415 // x && true -> x
1416 if (match(Def, m_c_LogicalAnd(m_VPValue(X), m_True())))
1417 return Def->replaceAllUsesWith(X);
1418
1419 // (x && y) | (x && z) -> x && (y | z)
1420 if (CanCreateNewRecipe &&
1423 // Simplify only if one of the operands has one use to avoid creating an
1424 // extra recipe.
1425 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1426 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1427 return Def->replaceAllUsesWith(
1428 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1429
1430 // x && (x && y) -> x && y
1431 if (match(Def, m_LogicalAnd(m_VPValue(X),
1432 m_LogicalAnd(m_Deferred(X), m_VPValue(Y)))))
1433 return Def->replaceAllUsesWith(Def->getOperand(1));
1434
1435 // x && (y && x) -> x && y
1436 if (match(Def, m_LogicalAnd(m_VPValue(X),
1437 m_LogicalAnd(m_VPValue(Y), m_Deferred(X)))))
1438 return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
1439
1440 // x && !x -> 0
1441 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1442 return Def->replaceAllUsesWith(Plan->getFalse());
1443
1444 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1445 return Def->replaceAllUsesWith(X);
1446
1447 // select c, false, true -> not c
1448 VPValue *C;
1449 if (CanCreateNewRecipe &&
1450 match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1451 return Def->replaceAllUsesWith(Builder.createNot(C));
1452
1453 // select !c, x, y -> select c, y, x
1454 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1455 Def->setOperand(0, C);
1456 Def->setOperand(1, Y);
1457 Def->setOperand(2, X);
1458 return;
1459 }
1460
1461 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1462 return Def->replaceAllUsesWith(A);
1463
1464 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1465 return Def->replaceAllUsesWith(A);
1466
1467 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1468 return Def->replaceAllUsesWith(
1469 Plan->getZero(TypeInfo.inferScalarType(Def)));
1470
1471 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_AllOnes()))) {
1472 // Preserve nsw from the Mul on the new Sub.
1474 false, cast<VPRecipeWithIRFlags>(Def)->hasNoSignedWrap()};
1475 return Def->replaceAllUsesWith(
1476 Builder.createSub(Plan->getZero(TypeInfo.inferScalarType(A)), A,
1477 Def->getDebugLoc(), "", NW));
1478 }
1479
1480 const APInt *APC;
1481 if (CanCreateNewRecipe && match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) &&
1482 APC->isPowerOf2())
1483 return Def->replaceAllUsesWith(Builder.createNaryOp(
1484 Instruction::Shl,
1485 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1486 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1487
1488 if (CanCreateNewRecipe && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1489 APC->isPowerOf2())
1490 return Def->replaceAllUsesWith(Builder.createNaryOp(
1491 Instruction::LShr,
1492 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1493 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1494
1495 if (match(Def, m_Not(m_VPValue(A)))) {
1496 if (match(A, m_Not(m_VPValue(A))))
1497 return Def->replaceAllUsesWith(A);
1498
1499 // Try to fold Not into compares by adjusting the predicate in-place.
1500 CmpPredicate Pred;
1501 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1502 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1503 if (all_of(Cmp->users(),
1505 m_Not(m_Specific(Cmp)),
1506 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1507 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1508 for (VPUser *U : to_vector(Cmp->users())) {
1509 auto *R = cast<VPSingleDefRecipe>(U);
1510 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1511 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1512 R->setOperand(1, Y);
1513 R->setOperand(2, X);
1514 } else {
1515 // not (cmp pred) -> cmp inv_pred
1516 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1517 R->replaceAllUsesWith(Cmp);
1518 }
1519 }
1520 // If Cmp doesn't have a debug location, use the one from the negation,
1521 // to preserve the location.
1522 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1523 Cmp->setDebugLoc(Def->getDebugLoc());
1524 }
1525 }
1526 }
1527
1528 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1529 // any-of (fcmp uno %A, %B), ...
1530 if (match(Def, m_AnyOf())) {
1532 VPRecipeBase *UnpairedCmp = nullptr;
1533 for (VPValue *Op : Def->operands()) {
1534 VPValue *X;
1535 if (Op->getNumUsers() > 1 ||
1537 m_Deferred(X)))) {
1538 NewOps.push_back(Op);
1539 } else if (!UnpairedCmp) {
1540 UnpairedCmp = Op->getDefiningRecipe();
1541 } else {
1542 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1543 UnpairedCmp->getOperand(0), X));
1544 UnpairedCmp = nullptr;
1545 }
1546 }
1547
1548 if (UnpairedCmp)
1549 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1550
1551 if (NewOps.size() < Def->getNumOperands()) {
1552 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1553 return Def->replaceAllUsesWith(NewAnyOf);
1554 }
1555 }
1556
1557 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1558 // This is useful for fmax/fmin without fast-math flags, where we need to
1559 // check if any operand is NaN.
1560 if (CanCreateNewRecipe &&
1562 m_Deferred(X)),
1564 m_Deferred(Y))))) {
1565 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1566 return Def->replaceAllUsesWith(NewCmp);
1567 }
1568
1569 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1570 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1571 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1572 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1573 TypeInfo.inferScalarType(Def))
1574 return Def->replaceAllUsesWith(Def->getOperand(1));
1575
1577 m_One()))) {
1578 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1579 if (TypeInfo.inferScalarType(X) != WideStepTy)
1580 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1581 Def->replaceAllUsesWith(X);
1582 return;
1583 }
1584
1585 // For i1 vp.merges produced by AnyOf reductions:
1586 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1588 m_VPValue(X), m_VPValue())) &&
1590 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1591 Def->setOperand(1, Def->getOperand(0));
1592 Def->setOperand(0, Y);
1593 return;
1594 }
1595
1596 // Simplify MaskedCond with no block mask to its single operand.
1598 !cast<VPInstruction>(Def)->isMasked())
1599 return Def->replaceAllUsesWith(Def->getOperand(0));
1600
1601 // Look through ExtractLastLane.
1602 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1603 if (match(A, m_BuildVector())) {
1604 auto *BuildVector = cast<VPInstruction>(A);
1605 Def->replaceAllUsesWith(
1606 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1607 return;
1608 }
1609 if (Plan->hasScalarVFOnly())
1610 return Def->replaceAllUsesWith(A);
1611 }
1612
1613 // Look through ExtractPenultimateElement (BuildVector ...).
1615 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1616 Def->replaceAllUsesWith(
1617 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1618 return;
1619 }
1620
1621 uint64_t Idx;
1623 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1624 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1625 return;
1626 }
1627
1628 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1629 Def->replaceAllUsesWith(
1630 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1631 return;
1632 }
1633
1634 // Look through a broadcast of a single scalar when used as a select condition;
1635 // in that case the scalar condition can be used directly.
1636 if (match(Def,
1639 "broadcast operand must be single-scalar");
1640 Def->setOperand(0, C);
1641 return;
1642 }
1643
1645 if (Def->getNumOperands() == 1) {
1646 Def->replaceAllUsesWith(Def->getOperand(0));
1647 return;
1648 }
1649 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1650 if (all_equal(Phi->incoming_values()))
1651 Phi->replaceAllUsesWith(Phi->getOperand(0));
1652 }
1653 return;
1654 }
1655
1656 VPIRValue *IRV;
1657 if (Def->getNumOperands() == 1 &&
1659 return Def->replaceAllUsesWith(IRV);
1660
1661 // Some simplifications can only be applied after unrolling. Perform them
1662 // below.
1663 if (!Plan->isUnrolled())
1664 return;
1665
1666 // After unrolling, extract-lane may be used to extract values from multiple
1667 // scalar sources. Only simplify when extracting from a single scalar source.
1668 VPValue *LaneToExtract;
1669 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1670 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1672 return Def->replaceAllUsesWith(A);
1673
1674 // Simplify extract-lane with single source to extract-element.
1675 Def->replaceAllUsesWith(Builder.createNaryOp(
1676 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1677 return;
1678 }
1679
1680 // Look for cycles where Def is of the form:
1681 // X = phi(0, IVInc) ; used only by IVInc, or by IVInc and Inc = X + Y
1682 // IVInc = X + Step ; used by X and Def
1683 // Def = IVInc + Y
1684 // Fold the increment Y into the phi's start value, replace Def with IVInc,
1685 // and if Inc exists, replace it with X.
1686 if (match(Def, m_Add(m_Add(m_VPValue(X), m_VPValue()), m_VPValue(Y))) &&
1687 isa<VPIRValue>(Y) &&
1688 match(X, m_VPPhi(m_ZeroInt(), m_Specific(Def->getOperand(0))))) {
1689 auto *Phi = cast<VPPhi>(X);
1690 auto *IVInc = Def->getOperand(0);
1691 if (IVInc->getNumUsers() == 2) {
1692 // If Phi has a second user (besides IVInc's defining recipe), it must
1693 // be Inc = Phi + Y for the fold to apply.
1696 if (Phi->getNumUsers() == 1 || (Phi->getNumUsers() == 2 && Inc)) {
1697 Def->replaceAllUsesWith(IVInc);
1698 if (Inc)
1699 Inc->replaceAllUsesWith(Phi);
1700 Phi->setOperand(0, Y);
1701 return;
1702 }
1703 }
1704 }
1705
1706 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1707 // just the pointer operand.
1708 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1709 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1710 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1711
1712 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1713 // the start index is zero and only lane 0 is demanded.
1714 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1715 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1716 Steps->replaceAllUsesWith(Steps->getOperand(0));
1717 return;
1718 }
1719 }
1720 // Simplify redundant ReductionStartVector recipes after unrolling.
1721 VPValue *StartV;
1723 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1724 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1725 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1726 return PhiR && PhiR->isInLoop();
1727 });
1728 return;
1729 }
1730
1732 Def->replaceAllUsesWith(A);
1733 return;
1734 }
1735
1736 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1739 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1740 all_of(A->users(),
1741 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1742 return Def->replaceAllUsesWith(A);
1743 }
1744
1745 if (Plan->getConcreteUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1746 return Def->replaceAllUsesWith(A);
1747}
1748
1751 Plan.getEntry());
1752 VPTypeAnalysis TypeInfo(Plan);
1754 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1755 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1756 simplifyRecipe(Def, TypeInfo);
1757 }
1758}
1759
1760/// Reassociate (headermask && x) && y -> headermask && (x && y) to allow the
1761/// header mask to be simplified further when tail folding, e.g. in
1762/// optimizeEVLMasks.
1763static void reassociateHeaderMask(VPlan &Plan) {
1764 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
1765 if (!HeaderMask)
1766 return;
1767
1768 SmallVector<VPUser *> Worklist;
1769 for (VPUser *U : HeaderMask->users())
1770 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue())))
1771 Worklist.push_back(U);
1772
1773 while (!Worklist.empty()) {
1774 auto *R = dyn_cast<VPSingleDefRecipe>(Worklist.pop_back_val());
1775 VPValue *X, *Y;
1776 if (!R || !match(R, m_LogicalAnd(
1777 m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(X)),
1778 m_VPValue(Y))))
1779 continue;
1780 append_range(Worklist, R->users());
1781 VPBuilder Builder(R);
1782 R->replaceAllUsesWith(
1783 Builder.createLogicalAnd(HeaderMask, Builder.createLogicalAnd(X, Y)));
1784 }
1785}
1786
1788 if (Plan.hasScalarVFOnly())
1789 return;
1790
1791 // Try to narrow wide and replicating recipes to single scalar recipes,
1792 // based on VPlan analysis. Only process blocks in the loop region for now,
1793 // without traversing into nested regions, as recipes in replicate regions
1794 // cannot be converted yet.
1797 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1799 VPWidenStoreRecipe>(&R))
1800 continue;
1801 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1802 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1803 continue;
1804
1805 // Convert an unmasked scatter with a uniform address into
1806 // extract-last-lane + scalar store.
1807 // TODO: Add a profitability check comparing the cost of a scatter vs.
1808 // extract + scalar store.
1809 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1810 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1811 !WidenStoreR->isConsecutive()) {
1812 VPValue *Mask = WidenStoreR->getMask();
1813
1814 // Only convert the scatter to a scalar store if it is unmasked.
1815 // TODO: Support converting scatter masked by the header mask to scalar
1816 // store.
1817 if (Mask)
1818 continue;
1819
1821 {WidenStoreR->getOperand(1)});
1822 Extract->insertBefore(WidenStoreR);
1823
1824 // TODO: Sink the scalar store recipe to middle block if possible.
1825 auto *ScalarStore = new VPReplicateRecipe(
1826 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1827 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1828 *WidenStoreR /*Metadata*/);
1829 ScalarStore->insertBefore(WidenStoreR);
1830 WidenStoreR->eraseFromParent();
1831 continue;
1832 }
1833
1834 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1835 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1836 vputils::isSingleScalar(RepR->getOperand(1))) {
1837 auto *Clone = new VPReplicateRecipe(
1838 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1839 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1840 *RepR /*Metadata*/, RepR->getDebugLoc());
1841 Clone->insertBefore(RepOrWidenR);
1842 VPBuilder Builder(Clone);
1843 VPValue *ExtractOp = Clone->getOperand(0);
1844 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1845 ExtractOp =
1846 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1847 ExtractOp =
1848 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1849 Clone->setOperand(0, ExtractOp);
1850 RepR->eraseFromParent();
1851 continue;
1852 }
1853
1854 // Skip recipes that aren't single scalars.
1855 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1856 continue;
1857
1858 // Predicate to check if a user of Op introduces extra broadcasts.
1859 auto IntroducesBCastOf = [](const VPValue *Op) {
1860 return [Op](const VPUser *U) {
1861 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1865 VPI->getOpcode()))
1866 return false;
1867 }
1868 return !U->usesScalars(Op);
1869 };
1870 };
1871
1872 if (any_of(RepOrWidenR->users(), IntroducesBCastOf(RepOrWidenR)) &&
1873 none_of(RepOrWidenR->operands(), [&](VPValue *Op) {
1874 if (any_of(
1875 make_filter_range(Op->users(), not_equal_to(RepOrWidenR)),
1876 IntroducesBCastOf(Op)))
1877 return false;
1878 // Non-constant live-ins require broadcasts, while constants do not
1879 // need explicit broadcasts.
1880 auto *IRV = dyn_cast<VPIRValue>(Op);
1881 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1882 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1883 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1884 }))
1885 continue;
1886
1887 auto *Clone = new VPReplicateRecipe(
1888 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1889 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1890 Clone->insertBefore(RepOrWidenR);
1891 RepOrWidenR->replaceAllUsesWith(Clone);
1892 if (isDeadRecipe(*RepOrWidenR))
1893 RepOrWidenR->eraseFromParent();
1894 }
1895 }
1896}
1897
1898 /// Check whether all of \p Blend's masks share a common logically and'ed
1899 /// value and, if so, remove it from the masks.
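/// For example (illustrative sketch, names invented), with a shared edge mask %cem:
///   BLEND %a, (logical-and %cem, %m1), %b, (logical-and %cem, %m2)
/// is simplified to
///   BLEND %a, %m1, %b, %m2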
1901 if (Blend->isNormalized())
1902 return;
1903 VPValue *CommonEdgeMask;
1904 if (!match(Blend->getMask(0),
1905 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1906 return;
1907 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1908 if (!match(Blend->getMask(I),
1909 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1910 return;
1911 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1912 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1913}
1914
1915/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1916/// to make sure the masks are simplified.
1917static void simplifyBlends(VPlan &Plan) {
1920 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1921 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1922 if (!Blend)
1923 continue;
1924
1925 removeCommonBlendMask(Blend);
1926
1927 // Try to remove redundant blend recipes.
1928 SmallPtrSet<VPValue *, 4> UniqueValues;
1929 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1930 UniqueValues.insert(Blend->getIncomingValue(0));
1931 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1932 if (!match(Blend->getMask(I), m_False()))
1933 UniqueValues.insert(Blend->getIncomingValue(I));
1934
1935 if (UniqueValues.size() == 1) {
1936 Blend->replaceAllUsesWith(*UniqueValues.begin());
1937 Blend->eraseFromParent();
1938 continue;
1939 }
1940
1941 if (Blend->isNormalized())
1942 continue;
1943
1944 // Normalize the blend so its first incoming value is used as the initial
1945 // value with the others blended into it.
1946
1947 unsigned StartIndex = 0;
1948 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1949 // If a value's mask is used only by the blend then it can be removed as dead code.
1950 // TODO: Find the most expensive mask that can be removed this way, or a mask
1951 // that's used by multiple blends where it can be removed from them all.
1952 VPValue *Mask = Blend->getMask(I);
1953 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1954 StartIndex = I;
1955 break;
1956 }
1957 }
1958
1959 SmallVector<VPValue *, 4> OperandsWithMask;
1960 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1961
1962 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1963 if (I == StartIndex)
1964 continue;
1965 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1966 OperandsWithMask.push_back(Blend->getMask(I));
1967 }
1968
1969 auto *NewBlend =
1970 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1971 OperandsWithMask, *Blend, Blend->getDebugLoc());
1972 NewBlend->insertBefore(&R);
1973
1974 VPValue *DeadMask = Blend->getMask(StartIndex);
1975 Blend->replaceAllUsesWith(NewBlend);
1976 Blend->eraseFromParent();
1978
1979 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1980 VPValue *NewMask;
1981 if (NewBlend->getNumOperands() == 3 &&
1982 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1983 VPValue *Inc0 = NewBlend->getOperand(0);
1984 VPValue *Inc1 = NewBlend->getOperand(1);
1985 VPValue *OldMask = NewBlend->getOperand(2);
1986 NewBlend->setOperand(0, Inc1);
1987 NewBlend->setOperand(1, Inc0);
1988 NewBlend->setOperand(2, NewMask);
1989 if (OldMask->getNumUsers() == 0)
1990 cast<VPInstruction>(OldMask)->eraseFromParent();
1991 }
1992 }
1993 }
1994}
1995
1996/// Optimize the width of vector induction variables in \p Plan based on a known
1997/// constant Trip Count, \p BestVF and \p BestUF.
1999 ElementCount BestVF,
2000 unsigned BestUF) {
2001 // Only proceed if we have not completely removed the vector region.
2002 if (!Plan.getVectorLoopRegion())
2003 return false;
2004
2005 const APInt *TC;
2006 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
2007 return false;
2008
2009 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
2010 // and UF. Returns at least 8.
2011 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
2012 APInt AlignedTC =
2015 APInt MaxVal = AlignedTC - 1;
2016 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
2017 };
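// Illustrative example (not from the source): with TC = 100, VF = 4 and
// UF = 2, the trip count aligned to VF * UF = 8 is 104, so MaxVal = 103,
// which needs 7 active bits; PowerOf2Ceil(7) = 8 gives an i8 induction type.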
2018 unsigned NewBitWidth =
2019 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
2020
2021 LLVMContext &Ctx = Plan.getContext();
2022 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
2023
2024 bool MadeChange = false;
2025
2026 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2027 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2028 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2029
2030 // Currently only handle canonical IVs, as it is trivial to replace their
2031 // start and stop values, and we only perform the optimization when the
2032 // IV has a single user.
2033 if (!WideIV || !WideIV->isCanonical() ||
2034 WideIV->hasMoreThanOneUniqueUser() ||
2035 NewIVTy == WideIV->getScalarType())
2036 continue;
2037
2038 // Currently only handle cases where the single user is a header-mask
2039 // comparison with the backedge-taken-count.
2040 VPUser *SingleUser = WideIV->getSingleUser();
2041 if (!SingleUser ||
2042 !match(SingleUser,
2043 m_ICmp(m_Specific(WideIV),
2045 continue;
2046
2047 // Update IV operands and comparison bound to use new narrower type.
2048 auto *NewStart = Plan.getZero(NewIVTy);
2049 WideIV->setStartValue(NewStart);
2050 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
2051 WideIV->setStepValue(NewStep);
2052
2053 auto *NewBTC = new VPWidenCastRecipe(
2054 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy,
2055 nullptr, VPIRFlags::getDefaultFlags(Instruction::Trunc));
2056 Plan.getVectorPreheader()->appendRecipe(NewBTC);
2057 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
2058 Cmp->setOperand(1, NewBTC);
2059
2060 MadeChange = true;
2061 }
2062
2063 return MadeChange;
2064}
2065
2066 /// Return true if \p Cond is known to be true for the given \p BestVF and \p
2067/// BestUF.
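/// For example (illustrative), with a constant trip count of 8, BestVF = 8 and
/// BestUF = 1, the latch compare CanIV + VFxUF == vector-trip-count is known to
/// be true, so a branch-on-cond using it can be simplified.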
2069 ElementCount BestVF, unsigned BestUF,
2072 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
2073 &PSE](VPValue *C) {
2074 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
2075 });
2076
2077 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
2079 m_Specific(CanIV->getBackedgeValue()),
2080 m_Specific(&Plan.getVectorTripCount()))))
2081 return false;
2082
2083 // The compare checks CanIV + VFxUF == vector trip count. If the vector trip
2084 // count is not available as a SCEV, fall back to comparing directly against
2085 // the original trip count. This is stricter than necessary, as we
2086 // will only return true if the trip count == vector trip count.
2087 const SCEV *VectorTripCount =
2089 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2090 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
2091 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2092 "Trip count SCEV must be computable");
2093 ScalarEvolution &SE = *PSE.getSE();
2094 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2095 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2096 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
2097}
2098
2099/// Try to replace multiple active lane masks used for control flow with
2100/// a single, wide active lane mask instruction followed by multiple
2101/// extract subvector intrinsics. This applies to the active lane mask
2102/// instructions both in the loop and in the preheader.
2103/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
2104 /// new extracts from the first active lane mask, which has its last
2105/// operand (multiplier) set to UF.
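/// As a rough sketch (recipe syntax abbreviated), with UF = 2 the two masks
///   %alm0 = active-lane-mask %index0, %tc
///   %alm1 = active-lane-mask %index1, %tc
/// become
///   %wide = active-lane-mask %index0, %tc, UF   ; covers VF x UF lanes
///   %alm0 = vector.extract %wide, 0
///   %alm1 = vector.extract %wide, VF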
2107 unsigned UF) {
2108 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
2109 return false;
2110
2111 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2112 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2113 auto *Term = &ExitingVPBB->back();
2114
2115 using namespace llvm::VPlanPatternMatch;
2117 m_VPValue(), m_VPValue(), m_VPValue())))))
2118 return false;
2119
2120 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2121 LLVMContext &Ctx = Plan.getContext();
2122
2123 auto ExtractFromALM = [&](VPInstruction *ALM,
2124 SmallVectorImpl<VPValue *> &Extracts) {
2125 DebugLoc DL = ALM->getDebugLoc();
2126 for (unsigned Part = 0; Part < UF; ++Part) {
2128 Ops.append({ALM, Plan.getConstantInt(64, VF.getKnownMinValue() * Part)});
2129 auto *Ext =
2130 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
2131 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
2132 Extracts[Part] = Ext;
2133 Ext->insertAfter(ALM);
2134 }
2135 };
2136
2137 // Create a list of each active lane mask phi, ordered by unroll part.
2139 for (VPRecipeBase &R : Header->phis()) {
2141 if (!Phi)
2142 continue;
2143 VPValue *Index = nullptr;
2144 match(Phi->getBackedgeValue(),
2146 assert(Index && "Expected index from ActiveLaneMask instruction");
2147
2148 uint64_t Part;
2149 if (match(Index,
2151 m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
2152 Phis[Part] = Phi;
2153 else {
2154 // Anything other than a CanonicalIVIncrementForPart is part 0
2155 assert(!match(
2156 Index,
2158 Phis[0] = Phi;
2159 }
2160 }
2161
2162 assert(all_of(Phis, not_equal_to(nullptr)) &&
2163 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
2164
2165 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
2166 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
2167
2168 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
2169 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
2170 "Expected incoming values of Phi to be ActiveLaneMasks");
2171
2172 // When using wide lane masks, the return type of the get.active.lane.mask
2173 // intrinsic is VF x UF (last operand).
2174 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
2175 EntryALM->setOperand(2, ALMMultiplier);
2176 LoopALM->setOperand(2, ALMMultiplier);
2177
2178 // Create UF x extract vectors and insert into preheader.
2179 SmallVector<VPValue *> EntryExtracts(UF);
2180 ExtractFromALM(EntryALM, EntryExtracts);
2181
2182 // Create UF x extract vectors and insert before the loop compare & branch,
2183 // updating the compare to use the first extract.
2184 SmallVector<VPValue *> LoopExtracts(UF);
2185 ExtractFromALM(LoopALM, LoopExtracts);
2186 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2187 Not->setOperand(0, LoopExtracts[0]);
2188
2189 // Update the incoming values of active lane mask phis.
2190 for (unsigned Part = 0; Part < UF; ++Part) {
2191 Phis[Part]->setStartValue(EntryExtracts[Part]);
2192 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2193 }
2194
2195 return true;
2196}
2197
2198/// Try to simplify the branch condition of \p Plan. This may restrict the
2199/// resulting plan to \p BestVF and \p BestUF.
2201 unsigned BestUF,
2203 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2204 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2205 auto *Term = &ExitingVPBB->back();
2206 VPValue *Cond;
2207 auto m_CanIVInc = m_Add(m_VPValue(), m_Specific(&Plan.getVFxUF()));
2208 // Check if the branch condition compares the canonical IV increment (for the
2209 // main loop), or the canonical IV increment plus an offset (for the epilogue loop).
2210 if (match(Term, m_BranchOnCount(
2211 m_CombineOr(m_CanIVInc, m_c_Add(m_CanIVInc, m_LiveIn())),
2212 m_VPValue())) ||
2214 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2215 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2216 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2217 const SCEV *VectorTripCount =
2219 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2220 VectorTripCount =
2222 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2223 "Trip count SCEV must be computable");
2224 ScalarEvolution &SE = *PSE.getSE();
2225 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2226 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2227 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2228 return false;
2229 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2231 // For BranchOnCond, check if we can prove the condition to be true using VF
2232 // and UF.
2233 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2234 return false;
2235 } else {
2236 return false;
2237 }
2238
2239 // The vector loop region only executes once. Convert terminator of the
2240 // exiting block to exit in the first iteration.
2241 if (match(Term, m_BranchOnTwoConds())) {
2242 Term->setOperand(1, Plan.getTrue());
2243 return true;
2244 }
2245
2246 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, Plan.getTrue(), {},
2247 {}, Term->getDebugLoc());
2248 ExitingVPBB->appendRecipe(BOC);
2249 Term->eraseFromParent();
2250
2251 return true;
2252}
2253
2254/// From the definition of llvm.experimental.get.vector.length,
2255/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2259 vp_depth_first_deep(Plan.getEntry()))) {
2260 for (VPRecipeBase &R : *VPBB) {
2261 VPValue *AVL;
2262 if (!match(&R, m_EVL(m_VPValue(AVL))))
2263 continue;
2264
2265 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2266 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2267 continue;
2268 ScalarEvolution &SE = *PSE.getSE();
2269 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2270 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2271 continue;
2272
2274 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2275 R.getDebugLoc());
2276 if (Trunc != AVL) {
2277 auto *TruncR = cast<VPSingleDefRecipe>(Trunc);
2278 const DataLayout &DL = Plan.getDataLayout();
2279 VPTypeAnalysis TypeInfo(Plan);
2280 if (VPValue *Folded =
2281 tryToFoldLiveIns(*TruncR, TruncR->operands(), DL, TypeInfo))
2282 Trunc = Folded;
2283 }
2284 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2285 return true;
2286 }
2287 }
2288 return false;
2289}
2290
2292 unsigned BestUF,
2294 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2295 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2296
2297 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2298 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2299 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2300
2301 if (MadeChange) {
2302 Plan.setVF(BestVF);
2303 assert(Plan.getConcreteUF() == BestUF && "BestUF must match the Plan's UF");
2304 }
2305}
2306
2307/// Sink users of \p FOR after the recipe defining the previous value \p
2308/// Previous of the recurrence. \returns true if all users of \p FOR could be
2309/// re-arranged as needed or false if it is not possible.
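/// As an illustrative sketch, for a recurrence phi %for whose previous value
/// %prev is defined after one of its users,
///   %use = add %for, 1
///   %prev = load %p
/// the user is moved after the definition of %prev:
///   %prev = load %p
///   %use = add %for, 1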
2310static bool
2312 VPRecipeBase *Previous,
2313 VPDominatorTree &VPDT) {
2314 // If Previous is a live-in (no defining recipe), it naturally dominates all
2315 // recipes in the loop, so no sinking is needed.
2316 if (!Previous)
2317 return true;
2318
2319 // Collect recipes that need sinking.
2322 Seen.insert(Previous);
2323 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2324 // The previous value must not depend on the users of the recurrence phi; if
2325 // it did, FOR would not be a fixed-order recurrence.
2326 if (SinkCandidate == Previous)
2327 return false;
2328
2329 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2330 !Seen.insert(SinkCandidate).second ||
2331 VPDT.properlyDominates(Previous, SinkCandidate))
2332 return true;
2333
2334 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2335 return false;
2336
2337 WorkList.push_back(SinkCandidate);
2338 return true;
2339 };
2340
2341 // Recursively sink users of FOR after Previous.
2342 WorkList.push_back(FOR);
2343 for (unsigned I = 0; I != WorkList.size(); ++I) {
2344 VPRecipeBase *Current = WorkList[I];
2345 assert(Current->getNumDefinedValues() == 1 &&
2346 "only recipes with a single defined value expected");
2347
2348 for (VPUser *User : Current->getVPSingleValue()->users()) {
2349 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2350 return false;
2351 }
2352 }
2353
2354 // Keep recipes to sink ordered by dominance so earlier instructions are
2355 // processed first.
2356 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2357 return VPDT.properlyDominates(A, B);
2358 });
2359
2360 for (VPRecipeBase *SinkCandidate : WorkList) {
2361 if (SinkCandidate == FOR)
2362 continue;
2363
2364 SinkCandidate->moveAfter(Previous);
2365 Previous = SinkCandidate;
2366 }
2367 return true;
2368}
2369
2370/// Try to hoist \p Previous and its operands before all users of \p FOR.
2372 VPRecipeBase *Previous,
2373 VPDominatorTree &VPDT) {
2374 if (cannotHoistOrSinkRecipe(*Previous))
2375 return false;
2376
2377 // Collect recipes that need hoisting.
2378 SmallVector<VPRecipeBase *> HoistCandidates;
2380 VPRecipeBase *HoistPoint = nullptr;
2381 // Find the closest hoist point by looking at all users of FOR and selecting
2382 // the recipe dominating all other users.
2383 for (VPUser *U : FOR->users()) {
2384 auto *R = cast<VPRecipeBase>(U);
2385 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2386 HoistPoint = R;
2387 }
2388 assert(all_of(FOR->users(),
2389 [&VPDT, HoistPoint](VPUser *U) {
2390 auto *R = cast<VPRecipeBase>(U);
2391 return HoistPoint == R ||
2392 VPDT.properlyDominates(HoistPoint, R);
2393 }) &&
2394 "HoistPoint must dominate all users of FOR");
2395
2396 auto NeedsHoisting = [HoistPoint, &VPDT,
2397 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2398 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2399 if (!HoistCandidate)
2400 return nullptr;
2401 VPRegionBlock *EnclosingLoopRegion =
2402 HoistCandidate->getParent()->getEnclosingLoopRegion();
2403 assert((!HoistCandidate->getRegion() ||
2404 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2405 "CFG in VPlan should still be flat, without replicate regions");
2406 // Hoist candidate was already visited, no need to hoist.
2407 if (!Visited.insert(HoistCandidate).second)
2408 return nullptr;
2409
2410 // The candidate is outside the loop region or is a header phi; it dominates
2411 // the users of FOR without hoisting.
2412 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2413 return nullptr;
2414
2415 // If we reached a recipe that dominates HoistPoint, we don't need to
2416 // hoist the recipe.
2417 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2418 return nullptr;
2419 return HoistCandidate;
2420 };
2421
2422 if (!NeedsHoisting(Previous->getVPSingleValue()))
2423 return true;
2424
2425 // Recursively try to hoist Previous and its operands before all users of FOR.
2426 HoistCandidates.push_back(Previous);
2427
2428 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2429 VPRecipeBase *Current = HoistCandidates[I];
2430 assert(Current->getNumDefinedValues() == 1 &&
2431 "only recipes with a single defined value expected");
2432 if (cannotHoistOrSinkRecipe(*Current))
2433 return false;
2434
2435 for (VPValue *Op : Current->operands()) {
2436 // If we reach FOR, it means the original Previous depends on some other
2437 // recurrence that in turn depends on FOR. If that is the case, we would
2438 // also need to hoist recipes involving the other FOR, which may break
2439 // dependencies.
2440 if (Op == FOR)
2441 return false;
2442
2443 if (auto *R = NeedsHoisting(Op)) {
2444 // Bail out if the recipe defines multiple values.
2445 // TODO: Hoisting such recipes requires additional handling.
2446 if (R->getNumDefinedValues() != 1)
2447 return false;
2448 HoistCandidates.push_back(R);
2449 }
2450 }
2451 }
2452
2453 // Order recipes to hoist by dominance so earlier instructions are processed
2454 // first.
2455 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2456 return VPDT.properlyDominates(A, B);
2457 });
2458
2459 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2460 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2461 HoistPoint->getIterator());
2462 }
2463
2464 return true;
2465}
2466
2468 VPBuilder &LoopBuilder) {
2469 VPDominatorTree VPDT(Plan);
2470 VPTypeAnalysis TypeInfo(Plan);
2471
2473 for (VPRecipeBase &R :
2476 RecurrencePhis.push_back(FOR);
2477
2478 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2480 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2481 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2482 // to terminate.
2483 while (auto *PrevPhi =
2485 assert(PrevPhi->getParent() == FOR->getParent());
2486 assert(SeenPhis.insert(PrevPhi).second);
2487 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2488 }
2489
2490 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2491 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2492 return false;
2493
2494 // Introduce a recipe to combine the incoming and previous values of a
2495 // fixed-order recurrence.
2496 VPBasicBlock *InsertBlock =
2497 Previous ? Previous->getParent() : FOR->getParent();
2498 if (!Previous || isa<VPHeaderPHIRecipe>(Previous))
2499 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2500 else
2501 LoopBuilder.setInsertPoint(InsertBlock,
2502 std::next(Previous->getIterator()));
2503
2504 auto *RecurSplice =
2506 {FOR, FOR->getBackedgeValue()});
2507
2508 FOR->replaceAllUsesWith(RecurSplice);
2509 // Set the first operand of RecurSplice to FOR again, after replacing
2510 // all users.
2511 RecurSplice->setOperand(0, FOR);
2512
2513 // Check for users extracting at the penultimate active lane of the FOR.
2514 // If only a single lane is active in the current iteration, we need to
2515 // select the last element from the previous iteration (from the FOR phi
2516 // directly).
2517 for (VPUser *U : RecurSplice->users()) {
2519 m_Specific(RecurSplice))))
2520 continue;
2521
2523 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2524 Type *Ty = TypeInfo.inferScalarType(LastActiveLane);
2525 VPValue *Zero = Plan.getConstantInt(Ty, 0);
2526 VPValue *One = Plan.getConstantInt(Ty, 1);
2527 VPValue *PenultimateIndex = B.createSub(LastActiveLane, One);
2528 VPValue *PenultimateLastIter =
2529 B.createNaryOp(VPInstruction::ExtractLane,
2530 {PenultimateIndex, FOR->getBackedgeValue()});
2531 VPValue *LastPrevIter =
2532 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2533
2534 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2535 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2536 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2537 }
2538 }
2539 return true;
2540}
2541
2543 for (VPRecipeBase &R :
2545 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2546 if (!PhiR)
2547 continue;
2548 RecurKind RK = PhiR->getRecurrenceKind();
2549 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2551 continue;
2552
2553 for (VPUser *U : collectUsersRecursively(PhiR))
2554 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2555 RecWithFlags->dropPoisonGeneratingFlags();
2556 }
2557 }
2558}
2559
2560namespace {
2561struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2562 static bool isSentinel(const VPSingleDefRecipe *Def) {
2563 return Def == getEmptyKey() || Def == getTombstoneKey();
2564 }
2565
2566 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2567 /// return that source element type.
2568 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2569 // All VPInstructions that lower to GEPs must have the i8 source element
2570 // type (as they are PtrAdds), so we omit it.
2572 .Case([](const VPReplicateRecipe *I) -> Type * {
2573 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2574 return GEP->getSourceElementType();
2575 return nullptr;
2576 })
2577 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2578 [](auto *I) { return I->getSourceElementType(); })
2579 .Default([](auto *) { return nullptr; });
2580 }
2581
2582 /// Returns true if recipe \p Def can be safely handled for CSE.
2583 static bool canHandle(const VPSingleDefRecipe *Def) {
2584 // We can extend the list of handled recipes in the future,
2585 // provided we account for the data embedded in them while checking for
2586 // equality or hashing.
2587 auto C = getOpcodeOrIntrinsicID(Def);
2588
2589 // The issue with (Insert|Extract)Value is that the index of the
2590 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2591 // VPlan.
2592 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2593 C->second == Instruction::ExtractValue)))
2594 return false;
2595
2596 // During CSE, we can only handle recipes that don't read from memory: if
2597 // they read from memory, there could be an intervening write to memory
2598 // before the next instance is CSE'd, leading to an incorrect result.
2599 return !Def->mayReadFromMemory();
2600 }
2601
2602 /// Hash the underlying data of \p Def.
2603 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2604 const VPlan *Plan = Def->getParent()->getPlan();
2605 VPTypeAnalysis TypeInfo(*Plan);
2606 hash_code Result = hash_combine(
2607 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2608 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2610 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2611 if (RFlags->hasPredicate())
2612 return hash_combine(Result, RFlags->getPredicate());
2613 return Result;
2614 }
2615
2616 /// Check equality of underlying data of \p L and \p R.
2617 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2618 if (isSentinel(L) || isSentinel(R))
2619 return L == R;
2620 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2622 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2624 !equal(L->operands(), R->operands()))
2625 return false;
2627 "must have valid opcode info for both recipes");
2628 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2629 if (LFlags->hasPredicate() &&
2630 LFlags->getPredicate() !=
2631 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2632 return false;
2633 // Recipes in replicate regions implicitly depend on predicate. If either
2634 // recipe is in a replicate region, only consider them equal if both have
2635 // the same parent.
2636 const VPRegionBlock *RegionL = L->getRegion();
2637 const VPRegionBlock *RegionR = R->getRegion();
2638 if (((RegionL && RegionL->isReplicator()) ||
2639 (RegionR && RegionR->isReplicator())) &&
2640 L->getParent() != R->getParent())
2641 return false;
2642 const VPlan *Plan = L->getParent()->getPlan();
2643 VPTypeAnalysis TypeInfo(*Plan);
2644 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2645 }
2646};
2647} // end anonymous namespace
2648
2649 /// Perform common-subexpression elimination of VPSingleDefRecipes on the \p
2650 /// Plan.
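/// For illustration (made-up values), two syntactically identical recipes
///   %a = add %x, %y
///   ...
///   %b = add %x, %y
/// are merged by replacing all uses of %b with %a, provided the definition of
/// %a dominates %b and neither recipe reads from memory.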
2652 VPDominatorTree VPDT(Plan);
2654
2656 Plan.getEntry());
2658 for (VPRecipeBase &R : *VPBB) {
2659 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2660 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2661 continue;
2662 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2663 // V must dominate Def for a valid replacement.
2664 if (!VPDT.dominates(V->getParent(), VPBB))
2665 continue;
2666 // Only keep flags present on both V and Def.
2667 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2668 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2669 Def->replaceAllUsesWith(V);
2670 continue;
2671 }
2672 CSEMap[Def] = Def;
2673 }
2674 }
2675}
2676
2677/// Move loop-invariant recipes out of the vector loop region in \p Plan.
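/// For example (sketch), a side-effect-free recipe whose operands are all
/// defined outside the loop region, such as %inv = mul %x, %y with
/// loop-invariant %x and %y, is moved from the vector loop body into the
/// vector preheader.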
2678static void licm(VPlan &Plan) {
2679 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2680
2681 // Hoist any loop-invariant recipes from the vector loop region to the
2682 // preheader. Perform a shallow traversal of the vector loop region, to
2683 // exclude recipes in replicate regions. Since the top-level blocks in the
2684 // vector loop region are guaranteed to execute if the vector preheader is,
2685 // we don't need to check speculation safety.
2686 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2687 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2688 "Expected vector prehader's successor to be the vector loop region");
2690 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2691 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2693 continue;
2694 if (any_of(R.operands(), [](VPValue *Op) {
2695 return !Op->isDefinedOutsideLoopRegions();
2696 }))
2697 continue;
2698 R.moveBefore(*Preheader, Preheader->end());
2699 }
2700 }
2701
2702#ifndef NDEBUG
2703 VPDominatorTree VPDT(Plan);
2704#endif
2705 // Sink recipes that have no users inside the vector loop region, if all of
2706 // their users are in the same exit block of the region.
2707 // TODO: Extend to sink recipes from inner loops.
2709 LoopRegion->getEntry());
2711 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
2713 continue;
2714
2715 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
2716 assert(!RepR->isPredicated() &&
2717 "Expected prior transformation of predicated replicates to "
2718 "replicate regions");
2719 // narrowToSingleScalarRecipes should have already maximally narrowed
2720 // replicates to single-scalar replicates.
2721 // TODO: When unrolling, replicateByVF doesn't handle sunk
2722 // non-single-scalar replicates correctly.
2723 if (!RepR->isSingleScalar())
2724 continue;
2725 }
2726
2727 // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
2728 // support recipes with multiple defined values (e.g., interleaved loads).
2729 auto *Def = cast<VPSingleDefRecipe>(&R);
2730 // Skip recipes without users as we cannot determine a sink block.
2731 // TODO: Clone sinkable recipes without users to all exit blocks to reduce
2732 // their execution frequency.
2733 if (Def->getNumUsers() == 0)
2734 continue;
2735
2736 VPBasicBlock *SinkBB = nullptr;
2737 // Cannot sink the recipe if any user
2738 // * is defined in any loop region, or
2739 // * is a phi, or
2740 // * is in a different block than another user.
2741 if (any_of(Def->users(), [&SinkBB](VPUser *U) {
2742 auto *UserR = cast<VPRecipeBase>(U);
2743 VPBasicBlock *Parent = UserR->getParent();
2744 // TODO: If the user is a PHI node, we should check the block of
2745 // incoming value. Support PHI node users if needed.
2746 if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
2747 return true;
2748 // TODO: Support sinking when users are in multiple blocks.
2749 if (SinkBB && SinkBB != Parent)
2750 return true;
2751 SinkBB = Parent;
2752 return false;
2753 }))
2754 continue;
2755
2756 // Only sink to dedicated exit blocks of the loop region.
2757 if (SinkBB->getSinglePredecessor() != LoopRegion)
2758 continue;
2759
2760 // TODO: This will need to be a check instead of an assert after
2761 // conditional branches in vectorized loops are supported.
2762 assert(VPDT.properlyDominates(VPBB, SinkBB) &&
2763 "Defining block must dominate sink block");
2764 // TODO: Clone the recipe if users are on multiple exit paths, instead of
2765 // just moving.
2766 Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
2767 }
2768 }
2769}
2770
2772 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2773 if (Plan.hasScalarVFOnly())
2774 return;
2775 // Keep track of created truncates, so they can be re-used. Note that we
2776 // cannot use RAUW after creating a new truncate, as this could make
2777 // other uses have different types for their operands, making them invalidly
2778 // typed.
2780 VPTypeAnalysis TypeInfo(Plan);
2781 VPBasicBlock *PH = Plan.getVectorPreheader();
2784 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2787 continue;
2788
2789 VPValue *ResultVPV = R.getVPSingleValue();
2790 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2791 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2792 if (!NewResSizeInBits)
2793 continue;
2794
2795 // If the value wasn't vectorized, we must maintain the original scalar
2796 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2797 // skip casts which do not need to be handled explicitly here, as
2798 // redundant casts will be removed during recipe simplification.
2800 continue;
2801
2802 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2803 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2804 assert(OldResTy->isIntegerTy() && "only integer types supported");
2805 (void)OldResSizeInBits;
2806
2807 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2808
2809 // Any wrapping introduced by shrinking this operation shouldn't be
2810 // considered undefined behavior. So, we can't unconditionally copy
2811 // arithmetic wrapping flags to VPW.
2812 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2813 VPW->dropPoisonGeneratingFlags();
2814
2815 if (OldResSizeInBits != NewResSizeInBits &&
2816 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2817 // Extend result to original width.
2818 auto *Ext = new VPWidenCastRecipe(
2819 Instruction::ZExt, ResultVPV, OldResTy, nullptr,
2820 VPIRFlags::getDefaultFlags(Instruction::ZExt));
2821 Ext->insertAfter(&R);
2822 ResultVPV->replaceAllUsesWith(Ext);
2823 Ext->setOperand(0, ResultVPV);
2824 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2825 } else {
2826 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2827 "Only ICmps should not need extending the result.");
2828 }
2829
2830 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2832 continue;
2833
2834 // Shrink operands by introducing truncates as needed.
2835 unsigned StartIdx =
2836 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2837 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2838 auto *Op = R.getOperand(Idx);
2839 unsigned OpSizeInBits =
2841 if (OpSizeInBits == NewResSizeInBits)
2842 continue;
2843 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2844 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2845 if (!IterIsEmpty) {
2846 R.setOperand(Idx, ProcessedIter->second);
2847 continue;
2848 }
2849
2850 VPBuilder Builder;
2851 if (isa<VPIRValue>(Op))
2852 Builder.setInsertPoint(PH);
2853 else
2854 Builder.setInsertPoint(&R);
2855 VPWidenCastRecipe *NewOp =
2856 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2857 ProcessedIter->second = NewOp;
2858 R.setOperand(Idx, NewOp);
2859 }
2860
2861 }
2862 }
2863}
2864
2865void VPlanTransforms::removeBranchOnConst(VPlan &Plan, bool OnlyLatches) {
2866 std::optional<VPDominatorTree> VPDT;
2867 if (OnlyLatches)
2868 VPDT.emplace(Plan);
2869
2870 // Collect all blocks before modifying the CFG so we can identify unreachable
2871 // ones after constant branch removal.
2873
2874 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(AllBlocks)) {
2875 VPValue *Cond;
2876 // Skip blocks that are not terminated by BranchOnCond.
2877 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2878 continue;
2879
2880 if (OnlyLatches && !VPBlockUtils::isLatch(VPBB, *VPDT))
2881 continue;
2882
2883 assert(VPBB->getNumSuccessors() == 2 &&
2884 "Two successors expected for BranchOnCond");
2885 unsigned RemovedIdx;
2886 if (match(Cond, m_True()))
2887 RemovedIdx = 1;
2888 else if (match(Cond, m_False()))
2889 RemovedIdx = 0;
2890 else
2891 continue;
2892
2893 VPBasicBlock *RemovedSucc =
2894 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2895 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2896 "There must be a single edge between VPBB and its successor");
2897 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2898 // these recipes.
2899 for (VPRecipeBase &R : RemovedSucc->phis())
2900 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2901
2902 // Disconnect blocks and remove the terminator.
2903 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2904 VPBB->back().eraseFromParent();
2905 }
2906
2907 // Compute which blocks are still reachable from the entry after constant
2908 // branch removal.
2911
2912 // Detach all unreachable blocks from their successors, removing their recipes
2913 // and incoming values from phi recipes.
2914 VPSymbolicValue Tmp;
2915 for (VPBlockBase *B : AllBlocks) {
2916 if (Reachable.contains(B))
2917 continue;
2918 for (VPBlockBase *Succ : to_vector(B->successors())) {
2919 if (auto *SuccBB = dyn_cast<VPBasicBlock>(Succ))
2920 for (VPRecipeBase &R : SuccBB->phis())
2921 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(B);
2923 }
2924 for (VPBasicBlock *DeadBB :
2926 for (VPRecipeBase &R : make_early_inc_range(*DeadBB)) {
2927 for (VPValue *Def : R.definedValues())
2928 Def->replaceAllUsesWith(&Tmp);
2929 R.eraseFromParent();
2930 }
2931 }
2932 }
2933}
2934
2956
2957// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2958// the loop terminator with a branch-on-cond recipe with the negated
2959// active-lane-mask as operand. Note that this turns the loop into an
2960 // uncountable one. Only the existing terminator is replaced; all other existing
2961// recipes/users remain unchanged, except for poison-generating flags being
2962// dropped from the canonical IV increment. Return the created
2963// VPActiveLaneMaskPHIRecipe.
2964//
2965// The function adds the following recipes:
2966//
2967// vector.ph:
2968// %EntryInc = canonical-iv-increment-for-part CanonicalIVStart
2969// %EntryALM = active-lane-mask %EntryInc, TC
2970//
2971// vector.body:
2972// ...
2973// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2974// ...
2975// %InLoopInc = canonical-iv-increment-for-part CanonicalIVIncrement
2976// %ALM = active-lane-mask %InLoopInc, TC
2977// %Negated = Not %ALM
2978// branch-on-cond %Negated
2979//
2982 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2983 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2984 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2985 VPValue *StartV = CanonicalIVPHI->getStartValue();
2986
2987 auto *CanonicalIVIncrement =
2988 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2989 // TODO: Check if dropping the flags is needed.
2990 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2991 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2992 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2993 // we have to take unrolling into account. Each part needs to start at
2994 // Part * VF
2995 auto *VecPreheader = Plan.getVectorPreheader();
2996 VPBuilder Builder(VecPreheader);
2997
2998 // Create the ActiveLaneMask instruction using the correct start values.
2999 VPValue *TC = Plan.getTripCount();
3000 VPValue *VF = &Plan.getVF();
3001
3002 auto *EntryIncrement = Builder.createOverflowingOp(
3003 VPInstruction::CanonicalIVIncrementForPart, {StartV, VF}, {false, false},
3004 DL, "index.part.next");
3005
3006 // Create the active lane mask instruction in the VPlan preheader.
3007 VPValue *ALMMultiplier =
3008 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
3009 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3010 {EntryIncrement, TC, ALMMultiplier}, DL,
3011 "active.lane.mask.entry");
3012
3013 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
3014 // preheader ActiveLaneMask instruction.
3015 auto *LaneMaskPhi =
3017 LaneMaskPhi->insertAfter(CanonicalIVPHI);
3018
3019 // Create the active lane mask for the next iteration of the loop before the
3020 // original terminator.
3021 VPRecipeBase *OriginalTerminator = EB->getTerminator();
3022 Builder.setInsertPoint(OriginalTerminator);
3023 auto *InLoopIncrement = Builder.createOverflowingOp(
3025 {CanonicalIVIncrement, &Plan.getVF()}, {false, false}, DL);
3026 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
3027 {InLoopIncrement, TC, ALMMultiplier}, DL,
3028 "active.lane.mask.next");
3029 LaneMaskPhi->addOperand(ALM);
3030
3031 // Replace the original terminator with BranchOnCond. We have to invert the
3032 // mask here because a true condition means jumping to the exit block.
3033 auto *NotMask = Builder.createNot(ALM, DL);
3034 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
3035 OriginalTerminator->eraseFromParent();
3036 return LaneMaskPhi;
3037}
3038
3040 bool UseActiveLaneMaskForControlFlow) {
3041 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3042 auto *FoundWidenCanonicalIVUser = find_if(
3044 assert(FoundWidenCanonicalIVUser &&
3045 "Must have widened canonical IV when tail folding!");
3046 VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
3047 auto *WideCanonicalIV =
3048 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
3049 VPSingleDefRecipe *LaneMask;
3050 if (UseActiveLaneMaskForControlFlow) {
3051 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(Plan);
3052 } else {
3053 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
3054 VPValue *ALMMultiplier =
3055 Plan.getConstantInt(LoopRegion->getCanonicalIVType(), 1);
3056 LaneMask =
3057 B.createNaryOp(VPInstruction::ActiveLaneMask,
3058 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
3059 nullptr, "active.lane.mask");
3060 }
3061
3062 // Walk users of WideCanonicalIV and replace the header mask of the form
3063 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
3064 // removing the old one to ensure there is always only a single header mask.
3065 HeaderMask->replaceAllUsesWith(LaneMask);
3066 HeaderMask->eraseFromParent();
3067}
3068
3069template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
3070 Op0_t In;
3072
3073 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
3074
3075 template <typename OpTy> bool match(OpTy *V) const {
3076 if (m_Specific(In).match(V)) {
3077 Out = nullptr;
3078 return true;
3079 }
3080 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
3081 }
3082};
3083
3084 /// Match the mask \p In directly, or a logical-and of \p In with another value.
3085 /// Sets \p Out to the remaining operand on a match, or to nullptr if \p In matched directly.
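/// Usage sketch: match(V, m_RemoveMask(HeaderMask, Mask)) succeeds both for
/// V == HeaderMask (Mask is set to nullptr) and for
/// V == logical-and(HeaderMask, M) (Mask is set to M).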
3086template <typename Op0_t, typename Op1_t>
3087static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
3088 Op1_t &Out) {
3089 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
3090}
3091
3092/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
3093/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
3094/// recipe could be created.
3095/// \p HeaderMask Header Mask.
3096 /// \p CurRecipe Recipe to be transformed.
3097/// \p TypeInfo VPlan-based type analysis.
3098/// \p EVL The explicit vector length parameter of vector-predication
3099/// intrinsics.
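/// As a rough example (recipe syntax abbreviated), a load masked by the header
/// mask combined with another mask %m,
///   widen.load %addr, (logical-and %header-mask, %m)
/// becomes an EVL-based recipe that drops the header mask:
///   vp.load %addr, %m, %evl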
3101 VPRecipeBase &CurRecipe,
3102 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
3103 VPlan *Plan = CurRecipe.getParent()->getPlan();
3104 DebugLoc DL = CurRecipe.getDebugLoc();
3105 VPValue *Addr, *Mask, *EndPtr;
3106
3107 /// Adjust any end pointers so that they point to the end of EVL lanes, not VF lanes.
3108 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
3109 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
3110 EVLEndPtr->insertBefore(&CurRecipe);
3111 EVLEndPtr->setOperand(1, &EVL);
3112 return EVLEndPtr;
3113 };
3114
3115 auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
3117 if (!V)
3118 return nullptr;
3119 auto *Reverse = new VPWidenIntrinsicRecipe(
3120 Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
3121 TypeInfo.inferScalarType(V), {}, {}, DL);
3122 Reverse->insertBefore(&CurRecipe);
3123 return Reverse;
3124 };
3125
3126 if (match(&CurRecipe,
3127 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
3128 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
3129 EVL, Mask);
3130
3131 VPValue *ReversedVal;
3132 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
3133 match(ReversedVal,
3134 m_MaskedLoad(m_VPValue(EndPtr),
3135 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3136 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3137 Mask = GetVPReverse(Mask);
3138 Addr = AdjustEndPtr(EndPtr);
3139 auto *LoadR = new VPWidenLoadEVLRecipe(
3140 *cast<VPWidenLoadRecipe>(ReversedVal), Addr, EVL, Mask);
3141 LoadR->insertBefore(&CurRecipe);
3142 return new VPWidenIntrinsicRecipe(
3143 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
3144 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
3145 }
3146
3147 VPValue *StoredVal;
3148 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
3149 m_RemoveMask(HeaderMask, Mask))))
3150 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3151 StoredVal, EVL, Mask);
3152
3153 if (match(&CurRecipe,
3154 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
3155 m_Reverse(m_RemoveMask(HeaderMask, Mask)))) &&
3156 match(EndPtr, m_VecEndPtr(m_VPValue(), m_Specific(&Plan->getVF())))) {
3157 Mask = GetVPReverse(Mask);
3158 Addr = AdjustEndPtr(EndPtr);
3159 StoredVal = GetVPReverse(ReversedVal);
3160 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
3161 StoredVal, EVL, Mask);
3162 }
3163
3164 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
3165 if (Rdx->isConditional() &&
3166 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
3167 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
3168
3169 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
3170 if (Interleave->getMask() &&
3171 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
3172 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
3173
3174 VPValue *LHS, *RHS;
3175 if (match(&CurRecipe,
3176 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
3177 return new VPWidenIntrinsicRecipe(
3178 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
3179 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3180
3181 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
3182 m_VPValue(RHS))))
3183 return new VPWidenIntrinsicRecipe(
3184 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3185 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3186
3187 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3188 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3189 VPValue *ZExt = VPBuilder(&CurRecipe)
3191 &EVL, Ty, TypeInfo.inferScalarType(&EVL), DL);
3192 return new VPInstruction(
3193 Instruction::Sub, {ZExt, Plan->getConstantInt(Ty, 1)},
3194 VPIRFlags::getDefaultFlags(Instruction::Sub), {}, DL);
3195 }
3196
3197 return nullptr;
3198}
3199
3200/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3201/// The transforms here need to preserve the original semantics.
3203 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3204 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3207 m_VPValue(EVL))) &&
3208 match(EVL, m_EVL(m_VPValue()))) {
3209 HeaderMask = R.getVPSingleValue();
3210 break;
3211 }
3212 }
3213 if (!HeaderMask)
3214 return;
3215
3216 VPTypeAnalysis TypeInfo(Plan);
3217 SmallVector<VPRecipeBase *> OldRecipes;
3218 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3219 auto *R = cast<VPRecipeBase>(U);
3220 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3221 NewR->insertBefore(R);
3222 for (auto [Old, New] :
3223 zip_equal(R->definedValues(), NewR->definedValues()))
3224 Old->replaceAllUsesWith(New);
3225 OldRecipes.push_back(R);
3226 }
3227 }
3228
3229 // Replace remaining (HeaderMask && Mask) with vp.merge (True, Mask,
3230 // False, EVL)
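  // For example (illustrative): vp.merge(true, %M, false, %evl) yields %M for
  // lanes below %evl and false for all later lanes, which is exactly
  // (HeaderMask && %M) under EVL tail folding.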
3231 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3232 VPValue *Mask;
3233 if (match(U, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) {
3234 auto *LogicalAnd = cast<VPInstruction>(U);
3235 auto *Merge = new VPWidenIntrinsicRecipe(
3236 Intrinsic::vp_merge, {Plan.getTrue(), Mask, Plan.getFalse(), EVL},
3237 TypeInfo.inferScalarType(Mask), {}, {}, LogicalAnd->getDebugLoc());
3238 Merge->insertBefore(LogicalAnd);
3239 LogicalAnd->replaceAllUsesWith(Merge);
3240 OldRecipes.push_back(LogicalAnd);
3241 }
3242 }
3243
3244 // Erase old recipes at the end so we don't invalidate TypeInfo.
3245 for (VPRecipeBase *R : reverse(OldRecipes)) {
3246 SmallVector<VPValue *> PossiblyDead(R->operands());
3247 R->eraseFromParent();
3248 for (VPValue *Op : PossiblyDead)
3249 recursivelyDeleteDeadRecipes(Op);
3250 }
3251}
3252
3253/// After replacing the canonical IV with an EVL-based IV, fix up recipes that
3254/// use VF to use the EVL instead, to avoid incorrect updates on the
3255/// penultimate iteration.
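/// For example (illustrative), a pointer induction whose backedge value steps
/// the pointer by VF elements each iteration must step by the EVL instead, so
/// that iterations where EVL < VF do not advance the pointer too far:
///   EMIT vp<%ptr.ind> = ptradd vp<%pointer.phi>, VF * step
///     ->
///   EMIT vp<%ptr.ind> = ptradd vp<%pointer.phi>, vp<%evl> * step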
3256static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3257 VPTypeAnalysis TypeInfo(Plan);
3258 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3259 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3260
3261 assert(all_of(Plan.getVF().users(),
3264 "User of VF that we can't transform to EVL.");
3265 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3267 });
3268
3269 assert(all_of(Plan.getVFxUF().users(),
3271 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3272 m_Specific(&Plan.getVFxUF())),
3274 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3275 "increment of the canonical induction.");
3276 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3277 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3278 // canonical induction must not be updated.
3280 });
3281
3282 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
3283 // contained.
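  // For example (illustrative): the recurrence splice then takes the last
  // active lane of the previous iteration, whose length may differ from VF:
  //   vp<%prev.evl> = phi [ vp<%max.evl>, vector.ph ], [ vp<%evl>, vector.body ]
  //   vp<%splice>   = vp.splice(vp<%prev>, vp<%cur>, -1, true, vp<%prev.evl>, vp<%evl>)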
3284 bool ContainsFORs =
3286 if (ContainsFORs) {
3287 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3288 VPValue *MaxEVL = &Plan.getVF();
3289 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3290 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3291 MaxEVL = Builder.createScalarZExtOrTrunc(
3292 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3293 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3294
3295 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3296 VPValue *PrevEVL = Builder.createScalarPhi(
3297 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3298
3301 for (VPRecipeBase &R : *VPBB) {
3302 VPValue *V1, *V2;
3303 if (!match(&R,
3305 m_VPValue(V1), m_VPValue(V2))))
3306 continue;
3307 VPValue *Imm = Plan.getOrAddLiveIn(
3310 Intrinsic::experimental_vp_splice,
3311 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3312 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3313 R.getDebugLoc());
3314 VPSplice->insertBefore(&R);
3315 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3316 }
3317 }
3318 }
3319
3320 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
3321 if (!HeaderMask)
3322 return;
3323
3324 // Ensure that any reduction that uses a select to mask off tail lanes does so
3325 // in the vector loop, not the middle block, since EVL tail folding can have
3326 // tail elements in the penultimate iteration.
3327 assert(all_of(*Plan.getMiddleBlock(), [&Plan, HeaderMask](VPRecipeBase &R) {
3328 if (match(&R, m_ComputeReductionResult(m_Select(m_Specific(HeaderMask),
3329 m_VPValue(), m_VPValue()))))
3330 return R.getOperand(0)->getDefiningRecipe()->getRegion() ==
3331 Plan.getVectorLoopRegion();
3332 return true;
3333 }));
3334
3335 // Replace header masks with a mask equivalent to predicating by EVL:
3336 //
3337 // icmp ule widen-canonical-iv backedge-taken-count
3338 // ->
3339 // icmp ult step-vector, EVL
3340 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3341 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3342 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3343 VPValue *EVLMask = Builder.createICmp(
3344      CmpInst::ICMP_ULT,
3345      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3346 HeaderMask->replaceAllUsesWith(EVLMask);
3347}
3348
3349/// Converts a tail-folded vector loop region to step by
3350/// VPInstruction::ExplicitVectorLength elements instead of VF elements on
3351/// each iteration.
3352///
3353/// - Adds a VPCurrentIterationPHIRecipe and related recipes to \p Plan and
3354///   replaces all uses of VPCanonicalIVPHIRecipe, except the canonical IV
3355///   increment, with the VPCurrentIterationPHIRecipe. After this
3356///   transformation, VPCanonicalIVPHIRecipe is used only for counting loop
3357///   iterations.
3358///
3359/// - The header mask is replaced with a header mask based on the EVL.
3360///
3361/// - Plans with FORs have a new phi added to keep track of the EVL of the
3362/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3363/// @llvm.vp.splice.
3364///
3365/// The function uses the following definitions:
3366/// %StartV is the canonical induction start value.
3367///
3368/// The function adds the following recipes:
3369///
3370/// vector.ph:
3371/// ...
3372///
3373/// vector.body:
3374/// ...
3375/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3376/// [ %NextIter, %vector.body ]
3377/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3378/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3379/// ...
3380/// %OpEVL = cast i32 %VPEVL to IVSize
3381/// %NextIter = add IVSize %OpEVL, %CurrentIter
3382/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3383/// ...
3384///
3385/// If MaxSafeElements is provided, the function adds the following recipes:
3386/// vector.ph:
3387/// ...
3388///
3389/// vector.body:
3390/// ...
3391/// %CurrentIter = CURRENT-ITERATION-PHI [ %StartV, %vector.ph ],
3392/// [ %NextIter, %vector.body ]
3393/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3394/// %cmp = cmp ult %AVL, MaxSafeElements
3395/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3396/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3397/// ...
3398/// %OpEVL = cast i32 %VPEVL to IVSize
3399/// %NextIter = add IVSize %OpEVL, %CurrentIter
3400/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3401/// ...
3402///
3403 void VPlanTransforms::addExplicitVectorLength(
3404    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3405 if (Plan.hasScalarVFOnly())
3406 return;
3407 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3408 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3409
3410 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3411 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3412 VPValue *StartV = CanonicalIVPHI->getStartValue();
3413
3414 // Create the CurrentIteration recipe in the vector loop.
3415 auto *CurrentIteration =
3417 CurrentIteration->insertAfter(CanonicalIVPHI);
3418 VPBuilder Builder(Header, Header->getFirstNonPhi());
3419 // Create the AVL (application vector length), starting from TC -> 0 in steps
3420 // of EVL.
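  // For example (illustrative): with trip count 10 and VF 4, the AVL phi may
  // take the values 10, 6, 2 while ExplicitVectorLength yields EVLs such as
  // 4, 4 and 2, so exactly 10 elements are processed in total.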
3421 VPPhi *AVLPhi = Builder.createScalarPhi(
3422 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3423 VPValue *AVL = AVLPhi;
3424
3425 if (MaxSafeElements) {
3426 // Cap the AVL to MaxSafeElements to honor the maximum safe dependence distance.
3427 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3428 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3429 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3430 "safe_avl");
3431 }
3432 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3433 DebugLoc::getUnknown(), "evl");
3434
3435 auto *CanonicalIVIncrement =
3436 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3437 Builder.setInsertPoint(CanonicalIVIncrement);
3438 VPValue *OpVPEVL = VPEVL;
3439
3440 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3441 OpVPEVL = Builder.createScalarZExtOrTrunc(
3442 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3443
3444 auto *NextIter = Builder.createAdd(
3445 OpVPEVL, CurrentIteration, CanonicalIVIncrement->getDebugLoc(),
3446 "current.iteration.next", CanonicalIVIncrement->getNoWrapFlags());
3447 CurrentIteration->addOperand(NextIter);
3448
3449 VPValue *NextAVL =
3450 Builder.createSub(AVLPhi, OpVPEVL, DebugLoc::getCompilerGenerated(),
3451 "avl.next", {/*NUW=*/true, /*NSW=*/false});
3452 AVLPhi->addOperand(NextAVL);
3453
3454 fixupVFUsersForEVL(Plan, *VPEVL);
3455 removeDeadRecipes(Plan);
3456
3457 // Replace all uses of VPCanonicalIVPHIRecipe by
3458 // VPCurrentIterationPHIRecipe except for the canonical IV increment.
3459 CanonicalIVPHI->replaceAllUsesWith(CurrentIteration);
3460 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3461 // TODO: support unroll factor > 1.
3462 Plan.setUF(1);
3463}
3464
3466 // Find the vector loop entry by locating VPCurrentIterationPHIRecipe.
3467 // There should be at most one VPCurrentIterationPHIRecipe in the entire plan.
3468 VPCurrentIterationPHIRecipe *CurrentIteration = nullptr;
3469
3472 for (VPRecipeBase &R : VPBB->phis())
3473 if (auto *PhiR = dyn_cast<VPCurrentIterationPHIRecipe>(&R)) {
3474 assert(!CurrentIteration &&
3475 "Found multiple CurrentIteration. Only one expected");
3476 CurrentIteration = PhiR;
3477 }
3478
3479 // Early return if it is not variable-length stepping.
3480 if (!CurrentIteration)
3481 return;
3482
3483 VPBasicBlock *HeaderVPBB = CurrentIteration->getParent();
3484 VPValue *CurrentIterationIncr = CurrentIteration->getBackedgeValue();
3485
3486 // Convert CurrentIteration to concrete recipe.
3487 auto *ScalarR =
3488 VPBuilder(CurrentIteration)
3490 {CurrentIteration->getStartValue(), CurrentIterationIncr},
3491 CurrentIteration->getDebugLoc(), "current.iteration.iv");
3492 CurrentIteration->replaceAllUsesWith(ScalarR);
3493 CurrentIteration->eraseFromParent();
3494
3495 // Replace CanonicalIVInc with CurrentIteration increment.
3496 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3497 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3498 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3499 m_Specific(&Plan.getVFxUF()))) &&
3500 "Unexpected canonical iv");
3501 Backedge->replaceAllUsesWith(CurrentIterationIncr);
3502
3503 // Remove unused phi and increment.
3504 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3505 CanonicalIVIncrement->eraseFromParent();
3506 CanonicalIV->eraseFromParent();
3507}
3508
3510 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3511 // The canonical IV may not exist at this stage.
3512 if (!LoopRegion ||
3514 return;
3515 VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
3516 if (std::next(CanIV->getIterator()) == CanIV->getParent()->end())
3517 return;
3518 // The EVL IV is always immediately after the canonical IV.
3520 std::next(CanIV->getIterator()));
3521 if (!EVLPhi)
3522 return;
3523
3524 // Bail if not an EVL tail folded loop.
3525 VPValue *AVL;
3526 if (!match(EVLPhi->getBackedgeValue(),
3527 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))))
3528 return;
3529
3530 // The AVL may be capped to a safe distance.
3531 VPValue *SafeAVL, *UnsafeAVL;
3532 if (match(AVL,
3534 m_VPValue(SafeAVL)),
3535 m_Deferred(UnsafeAVL), m_Deferred(SafeAVL))))
3536 AVL = UnsafeAVL;
3537
3538 VPValue *AVLNext;
3539 [[maybe_unused]] bool FoundAVLNext =
3541 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3542 assert(FoundAVLNext && "Didn't find AVL backedge?");
3543
3544 VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
3545 auto *LatchBr = cast<VPInstruction>(Latch->getTerminator());
3546 if (match(LatchBr, m_BranchOnCond(m_True())))
3547 return;
3548
3549 assert(
3550 match(LatchBr,
3553 m_Specific(&Plan.getVectorTripCount())))) &&
3554 "Expected BranchOnCond with ICmp comparing CanIV increment with vector "
3555 "trip count");
3556
3557 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3558 VPBuilder Builder(LatchBr);
3559 LatchBr->setOperand(
3560 0, Builder.createICmp(CmpInst::ICMP_EQ, AVLNext, Plan.getZero(AVLTy)));
3561}
3562
3563 void VPlanTransforms::replaceSymbolicStrides(
3564    VPlan &Plan, PredicatedScalarEvolution &PSE,
3565 const DenseMap<Value *, const SCEV *> &StridesMap) {
3566 // Replace VPValues for known constant strides guaranteed by predicate scalar
3567 // evolution.
3568 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3569 auto *R = cast<VPRecipeBase>(&U);
3570 return R->getRegion() ||
3571 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3572 };
3573 ValueToSCEVMapTy RewriteMap;
3574 for (const SCEV *Stride : StridesMap.values()) {
3575 using namespace SCEVPatternMatch;
3576 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3577 const APInt *StrideConst;
3578 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3579 // Only handle constant strides for now.
3580 continue;
3581
3582 auto *CI = Plan.getConstantInt(*StrideConst);
3583 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3584 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3585
3586 // The versioned value may not be used in the loop directly but through a
3587 // sext/zext. Add new live-ins in those cases.
3588 for (Value *U : StrideV->users()) {
3589      if (!isa<SExtInst, ZExtInst>(U))
3590        continue;
3591 VPValue *StrideVPV = Plan.getLiveIn(U);
3592 if (!StrideVPV)
3593 continue;
3594 unsigned BW = U->getType()->getScalarSizeInBits();
3595 APInt C =
3596 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3597 VPValue *CI = Plan.getConstantInt(C);
3598 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3599 }
3600 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3601 }
3602
3603 for (VPRecipeBase &R : *Plan.getEntry()) {
3604 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3605 if (!ExpSCEV)
3606 continue;
3607 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3608 auto *NewSCEV =
3609 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3610 if (NewSCEV != ScevExpr) {
3611 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3612 ExpSCEV->replaceAllUsesWith(NewExp);
3613 if (Plan.getTripCount() == ExpSCEV)
3614 Plan.resetTripCount(NewExp);
3615 }
3616 }
3617}
3618
3619 void VPlanTransforms::dropPoisonGeneratingRecipes(
3620    VPlan &Plan,
3621 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3622 // Collect recipes in the backward slice of `Root` that may generate a poison
3623 // value that is used after vectorization.
3624 SmallPtrSet<VPRecipeBase *, 16> Visited;
3625 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3626 SmallVector<VPRecipeBase *, 16> Worklist;
3627 Worklist.push_back(Root);
3628
3629 // Traverse the backward slice of Root through its use-def chain.
3630 while (!Worklist.empty()) {
3631 VPRecipeBase *CurRec = Worklist.pop_back_val();
3632
3633 if (!Visited.insert(CurRec).second)
3634 continue;
3635
3636 // Prune search if we find another recipe generating a widen memory
3637 // instruction. Widen memory instructions involved in address computation
3638 // will lead to gather/scatter instructions, which don't need to be
3639 // handled.
3640      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3641              VPHeaderPHIRecipe>(CurRec))
3642 continue;
3643
3644 // This recipe contributes to the address computation of a widen
3645 // load/store. If the underlying instruction has poison-generating flags,
3646 // drop them directly.
3647 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3648 VPValue *A, *B;
3649 // Dropping disjoint from an OR may yield incorrect results, as some
3650 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3651 // for dependence analysis). Instead, replace it with an equivalent Add.
3652 // This is possible as all users of the disjoint OR only access lanes
3653 // where the operands are disjoint or poison otherwise.
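        // For example (illustrative): (or disjoint 4, 3) == 7 == (add 4, 3),
        // so replacing the flagged or with an add preserves the result for all
        // lanes where the operands really are disjoint.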
3654 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3655 RecWithFlags->isDisjoint()) {
3656 VPBuilder Builder(RecWithFlags);
3657 VPInstruction *New =
3658 Builder.createAdd(A, B, RecWithFlags->getDebugLoc());
3659 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3660 RecWithFlags->replaceAllUsesWith(New);
3661 RecWithFlags->eraseFromParent();
3662 CurRec = New;
3663 } else
3664 RecWithFlags->dropPoisonGeneratingFlags();
3665 } else {
3666        Instruction *Instr = dyn_cast_or_null<Instruction>(
3667            CurRec->getVPSingleValue()->getUnderlyingValue());
3668 (void)Instr;
3669 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3670 "found instruction with poison generating flags not covered by "
3671 "VPRecipeWithIRFlags");
3672 }
3673
3674 // Add new definitions to the worklist.
3675 for (VPValue *Operand : CurRec->operands())
3676 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3677 Worklist.push_back(OpDef);
3678 }
3679 });
3680
3681 // Traverse all the recipes in the VPlan and collect the poison-generating
3682 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3683 // VPInterleaveRecipe.
3684 auto Iter = vp_depth_first_deep(Plan.getEntry());
3685  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3686 for (VPRecipeBase &Recipe : *VPBB) {
3687 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3688 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3689 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3690 if (AddrDef && WidenRec->isConsecutive() &&
3691 BlockNeedsPredication(UnderlyingInstr.getParent()))
3692 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3693 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3694 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3695 if (AddrDef) {
3696 // Check if any member of the interleave group needs predication.
3697 const InterleaveGroup<Instruction> *InterGroup =
3698 InterleaveRec->getInterleaveGroup();
3699 bool NeedPredication = false;
3700 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3701 I < NumMembers; ++I) {
3702 Instruction *Member = InterGroup->getMember(I);
3703 if (Member)
3704 NeedPredication |= BlockNeedsPredication(Member->getParent());
3705 }
3706
3707 if (NeedPredication)
3708 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3709 }
3710 }
3711 }
3712 }
3713}
3714
3715 void VPlanTransforms::createInterleaveGroups(
3716 VPlan &Plan,
3717    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3718 &InterleaveGroups,
3719 VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed) {
3720 if (InterleaveGroups.empty())
3721 return;
3722
3723 // Interleave memory: for each Interleave Group we marked earlier as relevant
3724 // for this VPlan, replace the Recipes widening its memory instructions with a
3725 // single VPInterleaveRecipe at its insertion point.
3726 VPDominatorTree VPDT(Plan);
3727 for (const auto *IG : InterleaveGroups) {
3728 auto *Start =
3729 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3730 VPIRMetadata InterleaveMD(*Start);
3731 SmallVector<VPValue *, 4> StoredValues;
3732 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3733 StoredValues.push_back(StoreR->getStoredValue());
3734 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3735 Instruction *MemberI = IG->getMember(I);
3736 if (!MemberI)
3737 continue;
3738 VPWidenMemoryRecipe *MemoryR =
3739 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3740 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3741 StoredValues.push_back(StoreR->getStoredValue());
3742 InterleaveMD.intersect(*MemoryR);
3743 }
3744
3745 bool NeedsMaskForGaps =
3746 (IG->requiresScalarEpilogue() && !EpilogueAllowed) ||
3747 (!StoredValues.empty() && !IG->isFull());
3748
3749 Instruction *IRInsertPos = IG->getInsertPos();
3750 auto *InsertPos =
3751 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3752
3754 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3755 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3756 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3757
3758 // Get or create the start address for the interleave group.
3759 VPValue *Addr = Start->getAddr();
3760 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3761 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3762 // We cannot re-use the address of member zero because it does not
3763 // dominate the insert position. Instead, use the address of the insert
3764 // position and create a PtrAdd adjusting it to the address of member
3765 // zero.
3766 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3767 // InsertPos or sink loads above zero members to join it.
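      // For example (illustrative): for a group of i32 members whose insert
      // position is member index 2, the offset below is 4 * 2 = 8 bytes and
      // member zero's address becomes ptradd(insert-position address, -8).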
3768 assert(IG->getIndex(IRInsertPos) != 0 &&
3769 "index of insert position shouldn't be zero");
3770 auto &DL = IRInsertPos->getDataLayout();
3771 APInt Offset(32,
3772 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3773 IG->getIndex(IRInsertPos),
3774 /*IsSigned=*/true);
3775 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3776 VPBuilder B(InsertPos);
3777 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3778 }
3779 // If the group is reverse, adjust the index to refer to the last vector
3780 // lane instead of the first. We adjust the index from the first vector
3781 // lane, rather than directly getting the pointer for lane VF - 1, because
3782 // the pointer operand of the interleaved access is supposed to be uniform.
3783 if (IG->isReverse()) {
3784 auto *ReversePtr = new VPVectorEndPointerRecipe(
3785 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3786 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3787 ReversePtr->insertBefore(InsertPos);
3788 Addr = ReversePtr;
3789 }
3790 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3791 InsertPos->getMask(), NeedsMaskForGaps,
3792 InterleaveMD, InsertPos->getDebugLoc());
3793 VPIG->insertBefore(InsertPos);
3794
3795 unsigned J = 0;
3796 for (unsigned i = 0; i < IG->getFactor(); ++i)
3797 if (Instruction *Member = IG->getMember(i)) {
3798 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3799 if (!Member->getType()->isVoidTy()) {
3800 VPValue *OriginalV = MemberR->getVPSingleValue();
3801 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3802 J++;
3803 }
3804 MemberR->eraseFromParent();
3805 }
3806 }
3807}
3808
3809/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3810/// value, phi and backedge value. In the following example:
3811///
3812/// vector.ph:
3813/// Successor(s): vector loop
3814///
3815/// <x1> vector loop: {
3816/// vector.body:
3817/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3818/// ...
3819/// EMIT branch-on-count ...
3820/// No successors
3821/// }
3822///
3823/// WIDEN-INDUCTION will get expanded to:
3824///
3825/// vector.ph:
3826/// ...
3827/// vp<%induction.start> = ...
3828/// vp<%induction.increment> = ...
3829///
3830/// Successor(s): vector loop
3831///
3832/// <x1> vector loop: {
3833/// vector.body:
3834/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3835/// ...
3836/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3837/// EMIT branch-on-count ...
3838/// No successors
3839/// }
3840static void
3841 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3842                               VPTypeAnalysis &TypeInfo) {
3843 VPlan *Plan = WidenIVR->getParent()->getPlan();
3844 VPValue *Start = WidenIVR->getStartValue();
3845 VPValue *Step = WidenIVR->getStepValue();
3846 VPValue *VF = WidenIVR->getVFValue();
3847 DebugLoc DL = WidenIVR->getDebugLoc();
3848
3849 // The value from the original loop to which we are mapping the new induction
3850 // variable.
3851 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3852
3853 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3854 Instruction::BinaryOps AddOp;
3855 Instruction::BinaryOps MulOp;
3856 VPIRFlags Flags = *WidenIVR;
3857 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3858 AddOp = Instruction::Add;
3859 MulOp = Instruction::Mul;
3860 } else {
3861 AddOp = ID.getInductionOpcode();
3862 MulOp = Instruction::FMul;
3863 }
3864
3865 // If the phi is truncated, truncate the start and step values.
3866 VPBuilder Builder(Plan->getVectorPreheader());
3867 Type *StepTy = TypeInfo.inferScalarType(Step);
3868 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3869 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3870 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3871 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3872 StepTy = Ty;
3873 }
3874
3875 // Construct the initial value of the vector IV in the vector loop preheader.
3876 Type *IVIntTy =
3877      IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3878 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3879 if (StepTy->isFloatingPointTy())
3880 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3881
3882 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3883 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3884
3885 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3886 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3887 DebugLoc::getUnknown(), "induction");
3888
3889 // Create the widened phi of the vector IV.
3890 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3891 WidenIVR->getDebugLoc(), "vec.ind");
3892 WidePHI->insertBefore(WidenIVR);
3893
3894 // Create the backedge value for the vector IV.
3895 VPValue *Inc;
3896 VPValue *Prev;
3897 // If unrolled, use the increment and prev value from the operands.
3898 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3899 Inc = SplatVF;
3900 Prev = WidenIVR->getLastUnrolledPartOperand();
3901 } else {
3902 if (VPRecipeBase *R = VF->getDefiningRecipe())
3903 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3904 // Multiply the vectorization factor by the step using integer or
3905 // floating-point arithmetic as appropriate.
3906 if (StepTy->isFloatingPointTy())
3907 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3908 DL);
3909 else
3910 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3911 TypeInfo.inferScalarType(VF), DL);
3912
3913 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3914 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3915 Prev = WidePHI;
3916 }
3917
3918 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3919 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3920 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3921 WidenIVR->getDebugLoc(), "vec.ind.next");
3922
3923 WidePHI->addOperand(Next);
3924
3925 WidenIVR->replaceAllUsesWith(WidePHI);
3926}
3927
3928/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3929/// initial value, phi and backedge value. In the following example:
3930///
3931/// <x1> vector loop: {
3932/// vector.body:
3933/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3934/// ...
3935/// EMIT branch-on-count ...
3936/// }
3937///
3938/// WIDEN-POINTER-INDUCTION will get expanded to:
3939///
3940/// <x1> vector loop: {
3941/// vector.body:
3942/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3943/// EMIT %mul = mul %stepvector, %step
3944/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3945/// ...
3946/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3947/// EMIT branch-on-count ...
3948/// }
3949 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3950                                           VPTypeAnalysis &TypeInfo) {
3951 VPlan *Plan = R->getParent()->getPlan();
3952 VPValue *Start = R->getStartValue();
3953 VPValue *Step = R->getStepValue();
3954 VPValue *VF = R->getVFValue();
3955
3956 assert(R->getInductionDescriptor().getKind() ==
3958 "Not a pointer induction according to InductionDescriptor!");
3959 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3960 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3961 "Recipe should have been replaced");
3962
3963 VPBuilder Builder(R);
3964 DebugLoc DL = R->getDebugLoc();
3965
3966 // Build a scalar pointer phi.
3967 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3968
3969 // Create actual address geps that use the pointer phi as base and a
3970 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3971 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3972 Type *StepTy = TypeInfo.inferScalarType(Step);
3973 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3974 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3975 VPValue *PtrAdd =
3976 Builder.createWidePtrAdd(ScalarPtrPhi, Offset, DL, "vector.gep");
3977 R->replaceAllUsesWith(PtrAdd);
3978
3979 // Create the backedge value for the scalar pointer phi.
3980 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3981 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3982 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3983 DL);
3984 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3985
3986 VPValue *InductionGEP =
3987 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3988 ScalarPtrPhi->addOperand(InductionGEP);
3989}
3990
3991 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3992 // Replace loop regions with an explicit CFG.
3993 SmallVector<VPRegionBlock *> LoopRegions;
3994  for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3995 vp_depth_first_deep(Plan.getEntry()))) {
3996 if (!R->isReplicator())
3997 LoopRegions.push_back(R);
3998 }
3999 for (VPRegionBlock *R : LoopRegions)
4000 R->dissolveToCFGLoop();
4001}
4002
4005 // The transform runs after dissolving loop regions, so all VPBasicBlocks
4006 // terminated with BranchOnTwoConds are reached via a shallow traversal.
4009 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
4010 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
4011 }
4012
4013 // Expand BranchOnTwoConds instructions into explicit CFG with two new
4014 // single-condition branches:
4015 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
4016 // the first condition is true, and otherwise jumps to a new interim block.
4017 // 2. A branch that ends the interim block, jumps to the second successor if
4018 // the second condition is true, and otherwise jumps to the third
4019 // successor.
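  // For example (illustrative block names):
  //   bb:                                 bb:
  //     branch-on-two-conds %c0, %c1        branch-on-cond %c0
  //   Successors: s0, s1, s2        =>    Successors: s0, bb.interim
  //
  //                                       bb.interim:
  //                                         branch-on-cond %c1
  //                                       Successors: s1, s2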
4020 for (VPInstruction *Br : WorkList) {
4021 assert(Br->getNumOperands() == 2 &&
4022 "BranchOnTwoConds must have exactly 2 conditions");
4023 DebugLoc DL = Br->getDebugLoc();
4024 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
4025 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
4026 assert(Successors.size() == 3 &&
4027 "BranchOnTwoConds must have exactly 3 successors");
4028
4029 for (VPBlockBase *Succ : Successors)
4030 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
4031
4032 VPValue *Cond0 = Br->getOperand(0);
4033 VPValue *Cond1 = Br->getOperand(1);
4034 VPBlockBase *Succ0 = Successors[0];
4035 VPBlockBase *Succ1 = Successors[1];
4036 VPBlockBase *Succ2 = Successors[2];
4037 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
4038 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
4039
4040 VPBasicBlock *InterimBB =
4041 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
4042
4043 VPBuilder(BrOnTwoCondsBB)
4044        .createNaryOp(VPInstruction::BranchOnCond, {Cond0}, DL);
4045 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
4046 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
4047
4048    VPBuilder(InterimBB).createNaryOp(VPInstruction::BranchOnCond, {Cond1}, DL);
4049 VPBlockUtils::connectBlocks(InterimBB, Succ1);
4050 VPBlockUtils::connectBlocks(InterimBB, Succ2);
4051 Br->eraseFromParent();
4052 }
4053}
4054
4055 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
4056 VPTypeAnalysis TypeInfo(Plan);
4057 SmallVector<VPRecipeBase *> ToRemove;
4058 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4059 vp_depth_first_deep(Plan.getEntry()))) {
4060 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4061 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
4062 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
4063 ToRemove.push_back(WidenIVR);
4064 continue;
4065 }
4066
4067 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
4068 // If the recipe only generates scalars, scalarize it instead of
4069 // expanding it.
4070 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
4071 VPBuilder Builder(WidenIVR);
4072 VPValue *PtrAdd =
4073 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
4074 WidenIVR->replaceAllUsesWith(PtrAdd);
4075 ToRemove.push_back(WidenIVR);
4076 continue;
4077 }
4078 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
4079 ToRemove.push_back(WidenIVR);
4080 continue;
4081 }
4082
4083 // Expand VPBlendRecipe into VPInstruction::Select.
4084 VPBuilder Builder(&R);
4085 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
4086 VPValue *Select = Blend->getIncomingValue(0);
4087 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
4088 Select = Builder.createSelect(Blend->getMask(I),
4089 Blend->getIncomingValue(I), Select,
4090 R.getDebugLoc(), "predphi", *Blend);
4091 Blend->replaceAllUsesWith(Select);
4092 ToRemove.push_back(Blend);
4093 }
4094
4095 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
4096 if (!VEPR->getOffset()) {
4097 assert(Plan.getConcreteUF() == 1 &&
4098 "Expected unroller to have materialized offset for UF != 1");
4099 VEPR->materializeOffset();
4100 }
4101 }
4102
4103 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
4104 Expr->decompose();
4105 ToRemove.push_back(Expr);
4106 }
4107
4108 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
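      // For example (illustrative): for a 4-lane mask <1,1,1,0>, Not gives
      // <0,0,0,1>, FirstActiveLane of that is 3, and subtracting 1 yields 2,
      // the index of the last active lane.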
4109 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
4110 if (LastActiveL &&
4111 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
4112 // Create Not(Mask) for all operands.
4113        SmallVector<VPValue *> NotMasks;
4114 for (VPValue *Op : LastActiveL->operands()) {
4115 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
4116 NotMasks.push_back(NotMask);
4117 }
4118
4119 // Create FirstActiveLane on the inverted masks.
4120 VPValue *FirstInactiveLane = Builder.createNaryOp(
4121            VPInstruction::FirstActiveLane, NotMasks,
4122 LastActiveL->getDebugLoc(), "first.inactive.lane");
4123
4124 // Subtract 1 to get the last active lane.
4125 VPValue *One =
4126 Plan.getConstantInt(TypeInfo.inferScalarType(FirstInactiveLane), 1);
4127 VPValue *LastLane =
4128 Builder.createSub(FirstInactiveLane, One,
4129 LastActiveL->getDebugLoc(), "last.active.lane");
4130
4131 LastActiveL->replaceAllUsesWith(LastLane);
4132 ToRemove.push_back(LastActiveL);
4133 continue;
4134 }
4135
4136 // Lower MaskedCond with block mask to LogicalAnd.
4138 auto *VPI = cast<VPInstruction>(&R);
4139 assert(VPI->isMasked() &&
4140 "Unmasked MaskedCond should be simplified earlier");
4141 VPI->replaceAllUsesWith(Builder.createNaryOp(
4142 VPInstruction::LogicalAnd, {VPI->getMask(), VPI->getOperand(0)}));
4143 ToRemove.push_back(VPI);
4144 continue;
4145 }
4146
4147 // Lower CanonicalIVIncrementForPart to plain Add.
4148 if (match(
4149 &R,
4150              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                      m_VPValue()))) {
4151 auto *VPI = cast<VPInstruction>(&R);
4152 VPValue *Add = Builder.createOverflowingOp(
4153 Instruction::Add, VPI->operands(), VPI->getNoWrapFlags(),
4154 VPI->getDebugLoc());
4155 VPI->replaceAllUsesWith(Add);
4156 ToRemove.push_back(VPI);
4157 continue;
4158 }
4159
4160 // Lower BranchOnCount to ICmp + BranchOnCond.
4161 VPValue *IV, *TC;
4162 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
4163 auto *BranchOnCountInst = cast<VPInstruction>(&R);
4164 DebugLoc DL = BranchOnCountInst->getDebugLoc();
4165 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
4166 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
4167 ToRemove.push_back(BranchOnCountInst);
4168 continue;
4169 }
4170
4171 VPValue *VectorStep;
4172 VPValue *ScalarStep;
4173      if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
4174 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
4175 continue;
4176
4177 // Expand WideIVStep.
4178 auto *VPI = cast<VPInstruction>(&R);
4179 Type *IVTy = TypeInfo.inferScalarType(VPI);
4180 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
4181        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
4182 ? Instruction::UIToFP
4183 : Instruction::Trunc;
4184 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
4185 }
4186
4187 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
4188 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
4189 ScalarStep =
4190 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
4191 }
4192
4193 VPIRFlags Flags;
4194 unsigned MulOpc;
4195 if (IVTy->isFloatingPointTy()) {
4196 MulOpc = Instruction::FMul;
4197 Flags = VPI->getFastMathFlags();
4198 } else {
4199 MulOpc = Instruction::Mul;
4200 Flags = VPIRFlags::getDefaultFlags(MulOpc);
4201 }
4202
4203 VPInstruction *Mul = Builder.createNaryOp(
4204 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
4205 VectorStep = Mul;
4206 VPI->replaceAllUsesWith(VectorStep);
4207 ToRemove.push_back(VPI);
4208 }
4209 }
4210
4211 for (VPRecipeBase *R : ToRemove)
4212 R->eraseFromParent();
4213}
4214
4216 VPBasicBlock *HeaderVPBB,
4217 VPBasicBlock *LatchVPBB,
4218 VPBasicBlock *MiddleVPBB,
4219 UncountableExitStyle Style) {
4220 struct EarlyExitInfo {
4221 VPBasicBlock *EarlyExitingVPBB;
4222 VPIRBasicBlock *EarlyExitVPBB;
4223 VPValue *CondToExit;
4224 };
4225
4226 VPDominatorTree VPDT(Plan);
4227 VPBuilder Builder(LatchVPBB->getTerminator());
4228  SmallVector<EarlyExitInfo> Exits;
4229 for (VPIRBasicBlock *ExitBlock : Plan.getExitBlocks()) {
4230 for (VPBlockBase *Pred : to_vector(ExitBlock->getPredecessors())) {
4231 if (Pred == MiddleVPBB)
4232 continue;
4233 // Collect condition for this early exit.
4234 auto *EarlyExitingVPBB = cast<VPBasicBlock>(Pred);
4235 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
4236 VPValue *CondOfEarlyExitingVPBB;
4237 [[maybe_unused]] bool Matched =
4238 match(EarlyExitingVPBB->getTerminator(),
4239 m_BranchOnCond(m_VPValue(CondOfEarlyExitingVPBB)));
4240 assert(Matched && "Terminator must be BranchOnCond");
4241
4242 // Insert the MaskedCond in the EarlyExitingVPBB so the predicator adds
4243 // the correct block mask.
4244 VPBuilder EarlyExitingBuilder(EarlyExitingVPBB->getTerminator());
4245 auto *CondToEarlyExit = EarlyExitingBuilder.createNaryOp(
4247 TrueSucc == ExitBlock
4248 ? CondOfEarlyExitingVPBB
4249 : EarlyExitingBuilder.createNot(CondOfEarlyExitingVPBB));
4250 assert((isa<VPIRValue>(CondOfEarlyExitingVPBB) ||
4251 !VPDT.properlyDominates(EarlyExitingVPBB, LatchVPBB) ||
4252 VPDT.properlyDominates(
4253 CondOfEarlyExitingVPBB->getDefiningRecipe()->getParent(),
4254 LatchVPBB)) &&
4255 "exit condition must dominate the latch");
4256 Exits.push_back({
4257 EarlyExitingVPBB,
4258 ExitBlock,
4259 CondToEarlyExit,
4260 });
4261 }
4262 }
4263
4264 assert(!Exits.empty() && "must have at least one early exit");
4265 // Sort exits by RPO order to get correct program order. RPO gives a
4266 // topological ordering of the CFG, ensuring upstream exits are checked
4267 // before downstream exits in the dispatch chain.
4269 HeaderVPBB);
4271 for (const auto &[Num, VPB] : enumerate(RPOT))
4272 RPOIdx[VPB] = Num;
4273 llvm::sort(Exits, [&RPOIdx](const EarlyExitInfo &A, const EarlyExitInfo &B) {
4274 return RPOIdx[A.EarlyExitingVPBB] < RPOIdx[B.EarlyExitingVPBB];
4275 });
4276#ifndef NDEBUG
4277 // After RPO sorting, verify that for any pair where one exit dominates
4278 // another, the dominating exit comes first. This is guaranteed by RPO
4279 // (topological order) and is required for the dispatch chain correctness.
4280 for (unsigned I = 0; I + 1 < Exits.size(); ++I)
4281 for (unsigned J = I + 1; J < Exits.size(); ++J)
4282 assert(!VPDT.properlyDominates(Exits[J].EarlyExitingVPBB,
4283 Exits[I].EarlyExitingVPBB) &&
4284 "RPO sort must place dominating exits before dominated ones");
4285#endif
4286
4287 // Build the AnyOf condition for the latch terminator using logical OR
4288 // to avoid poison propagation from later exit conditions when an earlier
4289 // exit is taken.
4290 VPValue *Combined = Exits[0].CondToExit;
4291 for (const EarlyExitInfo &Info : drop_begin(Exits))
4292 Combined = Builder.createLogicalOr(Combined, Info.CondToExit);
4293
4294 VPValue *IsAnyExitTaken =
4295 Builder.createNaryOp(VPInstruction::AnyOf, {Combined});
4296
4298 "Early exit store masking not implemented");
4299
4300 // Create the vector.early.exit blocks.
4301 SmallVector<VPBasicBlock *> VectorEarlyExitVPBBs(Exits.size());
4302 for (unsigned Idx = 0; Idx != Exits.size(); ++Idx) {
4303 Twine BlockSuffix = Exits.size() == 1 ? "" : Twine(".") + Twine(Idx);
4304 VPBasicBlock *VectorEarlyExitVPBB =
4305 Plan.createVPBasicBlock("vector.early.exit" + BlockSuffix);
4306 VectorEarlyExitVPBBs[Idx] = VectorEarlyExitVPBB;
4307 }
4308
4309 // Create the dispatch block (or reuse the single exit block if only one
4310 // exit). The dispatch block computes the first active lane of the combined
4311 // condition and, for multiple exits, chains through conditions to determine
4312 // which exit to take.
4313 VPBasicBlock *DispatchVPBB =
4314 Exits.size() == 1 ? VectorEarlyExitVPBBs[0]
4315 : Plan.createVPBasicBlock("vector.early.exit.check");
4316 VPBuilder DispatchBuilder(DispatchVPBB, DispatchVPBB->begin());
4317 VPValue *FirstActiveLane =
4318 DispatchBuilder.createNaryOp(VPInstruction::FirstActiveLane, {Combined},
4319 DebugLoc::getUnknown(), "first.active.lane");
4320
4321 // For each early exit, disconnect the original exiting block
4322 // (early.exiting.I) from the exit block (ir-bb<exit.I>) and route through a
4323 // new vector.early.exit block. Update ir-bb<exit.I>'s phis to extract their
4324 // values at the first active lane:
4325 //
4326 // Input:
4327 // early.exiting.I:
4328 // ...
4329 // EMIT branch-on-cond vp<%cond.I>
4330 // Successor(s): in.loop.succ, ir-bb<exit.I>
4331 //
4332 // ir-bb<exit.I>:
4333 // IR %phi = phi [ vp<%incoming.I>, early.exiting.I ], ...
4334 //
4335 // Output:
4336 // early.exiting.I:
4337 // ...
4338 // Successor(s): in.loop.succ
4339 //
4340 // vector.early.exit.I:
4341 // EMIT vp<%exit.val> = extract-lane vp<%first.lane>, vp<%incoming.I>
4342 // Successor(s): ir-bb<exit.I>
4343 //
4344 // ir-bb<exit.I>:
4345 // IR %phi = phi ... (extra operand: vp<%exit.val> from
4346 // vector.early.exit.I)
4347 //
4348 for (auto [Exit, VectorEarlyExitVPBB] :
4349 zip_equal(Exits, VectorEarlyExitVPBBs)) {
4350 auto &[EarlyExitingVPBB, EarlyExitVPBB, _] = Exit;
4351 // Adjust the phi nodes in EarlyExitVPBB.
4352 // 1. remove incoming values from EarlyExitingVPBB,
4353 // 2. extract the incoming value at FirstActiveLane
4354 // 3. add back the extracts as last operands for the phis
4355 // Then adjust the CFG, removing the edge between EarlyExitingVPBB and
4356 // EarlyExitVPBB and adding a new edge between VectorEarlyExitVPBB and
4357 // EarlyExitVPBB. The extracts at FirstActiveLane are now the incoming
4358 // values from VectorEarlyExitVPBB.
4359 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
4360 auto *ExitIRI = cast<VPIRPhi>(&R);
4361 VPValue *IncomingVal =
4362 ExitIRI->getIncomingValueForBlock(EarlyExitingVPBB);
4363 VPValue *NewIncoming = IncomingVal;
4364 if (!isa<VPIRValue>(IncomingVal)) {
4365 VPBuilder EarlyExitBuilder(VectorEarlyExitVPBB);
4366 NewIncoming = EarlyExitBuilder.createNaryOp(
4367 VPInstruction::ExtractLane, {FirstActiveLane, IncomingVal},
4368 DebugLoc::getUnknown(), "early.exit.value");
4369 }
4370 ExitIRI->removeIncomingValueFor(EarlyExitingVPBB);
4371 ExitIRI->addOperand(NewIncoming);
4372 }
4373
4374 EarlyExitingVPBB->getTerminator()->eraseFromParent();
4375 VPBlockUtils::disconnectBlocks(EarlyExitingVPBB, EarlyExitVPBB);
4376 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
4377 }
4378
4379 // Chain through exits: for each exit, check if its condition is true at
4380 // the first active lane. If so, take that exit; otherwise, try the next.
4381 // The last exit needs no check since it must be taken if all others fail.
4382 //
4383 // For 3 exits (cond.0, cond.1, cond.2), this creates:
4384 //
4385 // latch:
4386 // ...
4387 // EMIT vp<%combined> = logical-or vp<%cond.0>, vp<%cond.1>, vp<%cond.2>
4388 // ...
4389 //
4390 // vector.early.exit.check:
4391 // EMIT vp<%first.lane> = first-active-lane vp<%combined>
4392 // EMIT vp<%at.cond.0> = extract-lane vp<%first.lane>, vp<%cond.0>
4393 // EMIT branch-on-cond vp<%at.cond.0>
4394 // Successor(s): vector.early.exit.0, vector.early.exit.check.0
4395 //
4396 // vector.early.exit.check.0:
4397 // EMIT vp<%at.cond.1> = extract-lane vp<%first.lane>, vp<%cond.1>
4398 // EMIT branch-on-cond vp<%at.cond.1>
4399 // Successor(s): vector.early.exit.1, vector.early.exit.2
4400 VPBasicBlock *CurrentBB = DispatchVPBB;
4401 for (auto [I, Exit] : enumerate(ArrayRef(Exits).drop_back())) {
4402 VPValue *LaneVal = DispatchBuilder.createNaryOp(
4403 VPInstruction::ExtractLane, {FirstActiveLane, Exit.CondToExit},
4404 DebugLoc::getUnknown(), "exit.cond.at.lane");
4405
4406 // For the last dispatch, branch directly to the last exit on false;
4407 // otherwise, create a new check block.
4408 bool IsLastDispatch = (I + 2 == Exits.size());
4409 VPBasicBlock *FalseBB =
4410 IsLastDispatch ? VectorEarlyExitVPBBs.back()
4411 : Plan.createVPBasicBlock(
4412 Twine("vector.early.exit.check.") + Twine(I));
4413
4414 DispatchBuilder.createNaryOp(VPInstruction::BranchOnCond, {LaneVal});
4415 CurrentBB->setSuccessors({VectorEarlyExitVPBBs[I], FalseBB});
4416 VectorEarlyExitVPBBs[I]->setPredecessors({CurrentBB});
4417 FalseBB->setPredecessors({CurrentBB});
4418
4419 CurrentBB = FalseBB;
4420 DispatchBuilder.setInsertPoint(CurrentBB);
4421 }
4422
4423 // Replace the latch terminator with the new branching logic.
4424 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4425 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4426 "Unexpected terminator");
4427 auto *IsLatchExitTaken =
4428 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4429 LatchExitingBranch->getOperand(1));
4430
4431 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4432 LatchExitingBranch->eraseFromParent();
4433 Builder.setInsertPoint(LatchVPBB);
4434 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4435 {IsAnyExitTaken, IsLatchExitTaken}, LatchDL);
4436 LatchVPBB->clearSuccessors();
4437 LatchVPBB->setSuccessors({DispatchVPBB, MiddleVPBB, HeaderVPBB});
4438 DispatchVPBB->setPredecessors({LatchVPBB});
4439}
4440
4441/// This function tries to convert extended in-loop reductions into a
4442/// VPExpressionRecipe and clamps the \p Range if it is beneficial and
4443/// valid. The created recipe must be decomposed into its constituent
4444/// recipes before execution.
4445static VPExpressionRecipe *
4447 VFRange &Range) {
4448 Type *RedTy = Ctx.Types.inferScalarType(Red);
4449 VPValue *VecOp = Red->getVecOp();
4450
4451 // For partial reductions, the decision has already been made at the point of
4452 // transforming reductions -> partial reductions for a given plan, based on
4453 // the cost-model.
4454 if (Red->isPartialReduction())
4455 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4456
4457 // Clamp the range if using extended-reduction is profitable.
4458 auto IsExtendedRedValidAndClampRange =
4459 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4460    return LoopVectorizationPlanner::getDecisionAndClampRange(
4461 [&](ElementCount VF) {
4462 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4464
4466 InstructionCost ExtCost =
4467 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4468 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4469
4470 // TTI::getExtendedReductionCost for in-loop reductions
4471 // only supports integer types.
4472 if (RedTy->isFloatingPointTy())
4473 return false;
4474 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4475 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4476 Red->getFastMathFlags(), CostKind);
4477 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4478 },
4479 Range);
4480 };
4481
4482 VPValue *A;
4483 // Match reduce(ext(A)).
4486 IsExtendedRedValidAndClampRange(
4487 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4488 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4489 Ctx.Types.inferScalarType(A)))
4490 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4491
4492 return nullptr;
4493}
4494
4495/// This function tries to convert in-loop reductions with a multiply (and
4496/// possibly extends) into a VPExpressionRecipe and clamps the \p Range if it
4497/// is beneficial and valid. The created VPExpressionRecipe must be decomposed
4498/// into its constituent recipes before execution. Patterns of the
4499/// VPExpressionRecipe:
4500/// reduce.add(mul(...)),
4501/// reduce.add(mul(ext(A), ext(B))),
4502/// reduce.add(ext(mul(ext(A), ext(B)))),
4503/// reduce.fadd(fmul(ext(A), ext(B))).
4504static VPExpressionRecipe *
4506 VPCostContext &Ctx, VFRange &Range) {
4507 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4508 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4509 Opcode != Instruction::FAdd)
4510 return nullptr;
4511
4512 Type *RedTy = Ctx.Types.inferScalarType(Red);
4513
4514 // Clamp the range if using multiply-accumulate-reduction is profitable.
4515 auto IsMulAccValidAndClampRange =
4516      [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
4517 VPWidenCastRecipe *OuterExt) -> bool {
4518    return LoopVectorizationPlanner::getDecisionAndClampRange(
4519 [&](ElementCount VF) {
4520 // For partial reductions, the decision has already been made at the
4521 // point of transforming reductions -> partial reductions for a given
4522 // plan, based on the cost-model.
4523 if (Red->isPartialReduction())
4524 return true;
4525
4527 Type *SrcTy =
4528 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4529 InstructionCost MulAccCost;
4530
4531 // Only partial reductions support mixed or floating-point extends at
4532 // the moment.
4533 if (Ext0 && Ext1 &&
4534 (Ext0->getOpcode() != Ext1->getOpcode() ||
4535 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4536 return false;
4537
4538 bool IsZExt =
4539 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4540 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4541 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4542 SrcVecTy, CostKind);
4543
4544 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4545 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4546 InstructionCost ExtCost = 0;
4547 if (Ext0)
4548 ExtCost += Ext0->computeCost(VF, Ctx);
4549 if (Ext1)
4550 ExtCost += Ext1->computeCost(VF, Ctx);
4551 if (OuterExt)
4552 ExtCost += OuterExt->computeCost(VF, Ctx);
4553
4554 return MulAccCost.isValid() &&
4555 MulAccCost < ExtCost + MulCost + RedCost;
4556 },
4557 Range);
4558 };
4559
4560 VPValue *VecOp = Red->getVecOp();
4561 VPRecipeBase *Sub = nullptr;
4562 VPValue *A, *B;
4563 VPValue *Tmp = nullptr;
4564
4565 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4566 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4567 assert(Opcode == Instruction::FAdd &&
4568 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4569 "instruction");
4570 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4571 if (!FMul)
4572 return nullptr;
4573
4574 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4575 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4576
4577 if (RecipeA && RecipeB &&
4578 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4579 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4580 }
4581 }
4582 if (RedTy->isFloatingPointTy())
4583 return nullptr;
4584
4585 // Sub reductions could have a sub between the add reduction and vec op.
4586 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4587 Sub = VecOp->getDefiningRecipe();
4588 VecOp = Tmp;
4589 }
4590
4591 // If ValB is a constant and can be safely extended, truncate it to the same
4592 // type as ExtA's operand, then extend it to the same type as ExtA. This
4593 // creates two uniform extends that can more easily be matched by the rest of
4594 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4595 // replaced with the new extend of the constant.
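  // For example (illustrative): reduce.add(mul(zext i8 %a to i32, i32 42))
  // becomes reduce.add(mul(zext i8 %a to i32, zext(i8 42) to i32)); 42 fits in
  // i8, so the truncate/re-extend round trip is lossless and both mul operands
  // end up as matching zexts.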
4596 auto ExtendAndReplaceConstantOp = [&Ctx, &Red](VPWidenCastRecipe *ExtA,
4597 VPWidenCastRecipe *&ExtB,
4598 VPValue *&ValB,
4599 VPWidenRecipe *Mul) {
4600 if (!ExtA || ExtB || !isa<VPIRValue>(ValB) || Red->isPartialReduction())
4601 return;
4602 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4603 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4604 const APInt *Const;
4605 if (!match(ValB, m_APInt(Const)) ||
4607 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4608 return;
4609 // The truncate ensures that the type of each extended operand is the
4610 // same, and it's been proven that the constant can be extended from
4611 // NarrowTy safely. Necessary since ExtA's extended operand would be
4612 // e.g. an i8, while the const will likely be an i32. This will be
4613 // elided by later optimisations.
4614 VPBuilder Builder(Mul);
4615 auto *Trunc =
4616 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4617 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4618 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4619 Mul->setOperand(1, ExtB);
4620 };
4621
4622 // Try to match reduce.add(mul(...)).
4623 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4626 auto *Mul = cast<VPWidenRecipe>(VecOp);
4627
4628 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4629 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4630
4631 // Match reduce.add/sub(mul(ext, ext)).
4632 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4633 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4634 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4635 if (Sub)
4636 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4637 cast<VPWidenRecipe>(Sub), Red);
4638 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4639 }
4640 // TODO: Add an expression type for this variant with a negated mul
4641 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4642 return new VPExpressionRecipe(Mul, Red);
4643 }
4644 // TODO: Add an expression type for negated versions of other expression
4645 // variants.
4646 if (Sub)
4647 return nullptr;
4648
4649 // Match reduce.add(ext(mul(A, B))).
4650 if (!Red->isPartialReduction() &&
4651 match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4652 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4653 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4656
4657 // reduce.add(ext(mul(ext, const)))
4658 // -> reduce.add(ext(mul(ext, ext(const))))
4659 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4660
4661 // reduce.add(ext(mul(ext(A), ext(B))))
4662 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4663 // The inner extends must either have the same opcode as the outer extend or
4664 // be the same, in which case the multiply can never result in a negative
4665 // value and the outer extend can be folded away by doing wider
4666 // extends for the operands of the mul.
4667 if (Ext0 && Ext1 &&
4668 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4669 Ext0->getOpcode() == Ext1->getOpcode() &&
4670 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4671 auto *NewExt0 = new VPWidenCastRecipe(
4672 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4673 *Ext0, *Ext0, Ext0->getDebugLoc());
4674 NewExt0->insertBefore(Ext0);
4675
4676 VPWidenCastRecipe *NewExt1 = NewExt0;
4677 if (Ext0 != Ext1) {
4678 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4679 Ext->getResultType(), nullptr, *Ext1,
4680 *Ext1, Ext1->getDebugLoc());
4681 NewExt1->insertBefore(Ext1);
4682 }
4683 Mul->setOperand(0, NewExt0);
4684 Mul->setOperand(1, NewExt1);
4685 Red->setOperand(1, Mul);
4686 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4687 }
4688 }
4689 return nullptr;
4690}
4691
4692/// This function tries to create abstract recipes from the reduction recipe
4693/// for the following optimizations and cost estimation.
4695 VPCostContext &Ctx,
4696 VFRange &Range) {
4697 VPExpressionRecipe *AbstractR = nullptr;
4698 auto IP = std::next(Red->getIterator());
4699 auto *VPBB = Red->getParent();
4700 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4701 AbstractR = MulAcc;
4702 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4703 AbstractR = ExtRed;
4704 // Cannot create abstract inloop reduction recipes.
4705 if (!AbstractR)
4706 return;
4707
4708 AbstractR->insertBefore(*VPBB, IP);
4709 Red->replaceAllUsesWith(AbstractR);
4710}
4711
4722
4724 if (Plan.hasScalarVFOnly())
4725 return;
4726
4727#ifndef NDEBUG
4728 VPDominatorTree VPDT(Plan);
4729#endif
4730
4731 SmallVector<VPValue *> VPValues;
4732 if (VPValue *BTC = Plan.getBackedgeTakenCount())
4733 VPValues.push_back(BTC);
4734 append_range(VPValues, Plan.getLiveIns());
4735 for (VPRecipeBase &R : *Plan.getEntry())
4736 append_range(VPValues, R.definedValues());
4737
4738 auto *VectorPreheader = Plan.getVectorPreheader();
4739 for (VPValue *VPV : VPValues) {
4741 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4742 continue;
4743
4744 // Add explicit broadcast at the insert point that dominates all users.
4745 VPBasicBlock *HoistBlock = VectorPreheader;
4746 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4747 for (VPUser *User : VPV->users()) {
4748 if (User->usesScalars(VPV))
4749 continue;
4750 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4751 HoistPoint = HoistBlock->begin();
4752 else
4753 assert(VPDT.dominates(VectorPreheader,
4754 cast<VPRecipeBase>(User)->getParent()) &&
4755 "All users must be in the vector preheader or dominated by it");
4756 }
4757
4758 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4759 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4760 VPV->replaceUsesWithIf(Broadcast,
4761 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4762 return Broadcast != &U && !U.usesScalars(VPV);
4763 });
4764 }
4765}
4766
4768 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4769
4770 // Collect candidate loads with invariant addresses and noalias scopes
4771 // metadata and memory-writing recipes with noalias metadata.
4775 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4776 for (VPRecipeBase &R : *VPBB) {
4777 // Only handle single-scalar replicated loads with invariant addresses.
4778 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4779 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4780 RepR->getOpcode() != Instruction::Load)
4781 continue;
4782
4783 VPValue *Addr = RepR->getOperand(0);
4784 if (Addr->isDefinedOutsideLoopRegions()) {
4786 if (!Loc.AATags.Scope)
4787 continue;
4788 CandidateLoads.push_back({RepR, Loc});
4789 }
4790 }
4791 if (R.mayWriteToMemory()) {
4793 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4794 return;
4795 Stores.push_back(*Loc);
4796 }
4797 }
4798 }
4799
4800 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4801 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4802 // Hoist the load to the preheader if it doesn't alias with any stores
4803 // according to the noalias metadata. Other loads should have been hoisted
4804    // by other passes.
4805 const AAMDNodes &LoadAA = LoadLoc.AATags;
4806 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4808 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4809 })) {
4810 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4811 }
4812 }
4813}
4814
4815// Collect common metadata from a group of replicate recipes by intersecting
4816// metadata from all recipes in the group.
4817static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4818  VPIRMetadata CommonMetadata = *Recipes.front();
4819 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4820 CommonMetadata.intersect(*Recipe);
4821 return CommonMetadata;
4822}
4823
4824template <unsigned Opcode>
4828 const Loop *L) {
4829 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4830 "Only Load and Store opcodes supported");
4831 constexpr bool IsLoad = (Opcode == Instruction::Load);
4832 VPTypeAnalysis TypeInfo(Plan);
4833
4834 // For each address, collect operations with the same or complementary masks.
4836 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4837 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4838 };
4840 Plan, PSE, L,
4841 [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
4842 for (auto Recipes : Groups) {
4843 if (Recipes.size() < 2)
4844 continue;
4845
4846 // Collect groups with the same or complementary masks.
4847 for (VPReplicateRecipe *&RecipeI : Recipes) {
4848 if (!RecipeI)
4849 continue;
4850
4851 VPValue *MaskI = RecipeI->getMask();
4852 Type *TypeI = GetLoadStoreValueType(RecipeI);
4854 Group.push_back(RecipeI);
4855 RecipeI = nullptr;
4856
4857 // Find all operations with the same or complementary masks.
4858 bool HasComplementaryMask = false;
4859 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4860 if (!RecipeJ)
4861 continue;
4862
4863 VPValue *MaskJ = RecipeJ->getMask();
4864 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4865 if (TypeI == TypeJ) {
4866 // Check if any operation in the group has a complementary mask with
4867 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
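          // For instance, two predicated operations on the same address
          // guarded by masks %m and (not %m), as produced by an if/else on
          // the same condition, form a complementary pair.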
4868 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4869 match(MaskJ, m_Not(m_Specific(MaskI)));
4870 Group.push_back(RecipeJ);
4871 RecipeJ = nullptr;
4872 }
4873 }
4874
4875 if (HasComplementaryMask) {
4876 assert(Group.size() >= 2 && "must have at least 2 entries");
4877 AllGroups.push_back(std::move(Group));
4878 }
4879 }
4880 }
4881
4882 return AllGroups;
4883}
4884
4885// Find the recipe with minimum alignment in the group.
4886template <typename InstType>
4887static VPReplicateRecipe *
4889 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4890 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4891 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4892 });
4893}
4894
4897 const Loop *L) {
4898 auto Groups =
4900 if (Groups.empty())
4901 return;
4902
4903 // Process each group of loads.
4904 for (auto &Group : Groups) {
4905 // Try to use the earliest (most dominating) load to replace all others.
4906 VPReplicateRecipe *EarliestLoad = Group[0];
4907 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4908 VPBasicBlock *LastBB = Group.back()->getParent();
4909
4910 // Check that the load doesn't alias with stores between first and last.
4911 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4912 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4913 continue;
4914
4915 // Collect common metadata from all loads in the group.
4916 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4917
4918 // Find the load with minimum alignment to use.
4919 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4920
4921 bool IsSingleScalar = EarliestLoad->isSingleScalar();
4922 assert(all_of(Group,
4923 [IsSingleScalar](VPReplicateRecipe *R) {
4924 return R->isSingleScalar() == IsSingleScalar;
4925 }) &&
4926 "all members in group must agree on IsSingleScalar");
4927
4928 // Create an unpredicated version of the earliest load with common
4929 // metadata.
4930 auto *UnpredicatedLoad = new VPReplicateRecipe(
4931 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4932 IsSingleScalar, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);
4933
4934 UnpredicatedLoad->insertBefore(EarliestLoad);
4935
4936 // Replace all loads in the group with the unpredicated load.
4937 for (VPReplicateRecipe *Load : Group) {
4938 Load->replaceAllUsesWith(UnpredicatedLoad);
4939 Load->eraseFromParent();
4940 }
4941 }
4942}
4943
4944static bool
4945canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4946                             PredicatedScalarEvolution &PSE, const Loop &L,
4947 VPTypeAnalysis &TypeInfo) {
4948 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4949 if (!StoreLoc || !StoreLoc->AATags.Scope)
4950 return false;
4951
4952 // When sinking a group of stores, all members of the group alias each other.
4953 // Skip them during the alias checks.
4954 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4955 StoresToSink.end());
4956
4957 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4958 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4959 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4960 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4961}
4962
4965 const Loop *L) {
4966 auto Groups =
4968 if (Groups.empty())
4969 return;
4970
4971 VPTypeAnalysis TypeInfo(Plan);
4972
4973 for (auto &Group : Groups) {
4974 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4975 continue;
4976
4977 // Use the last (most dominated) store's location for the unconditional
4978 // store.
4979 VPReplicateRecipe *LastStore = Group.back();
4980 VPBasicBlock *InsertBB = LastStore->getParent();
4981
4982 // Collect common alias metadata from all stores in the group.
4983 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4984
4985 // Build select chain for stored values.
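    // E.g. (illustrative): for stores of V0 (mask M0), V1 (mask M1) and V2
    // (mask M2) in dominance order, the loop below builds
    //   select(M2, V2, select(M1, V1, V0))
    // which is then stored unconditionally.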
4986 VPValue *SelectedValue = Group[0]->getOperand(0);
4987 VPBuilder Builder(InsertBB, LastStore->getIterator());
4988
4989 bool IsSingleScalar = Group[0]->isSingleScalar();
4990 for (unsigned I = 1; I < Group.size(); ++I) {
4991 assert(IsSingleScalar == Group[I]->isSingleScalar() &&
4992 "all members in group must agree on IsSingleScalar");
4993 VPValue *Mask = Group[I]->getMask();
4994 VPValue *Value = Group[I]->getOperand(0);
4995 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4996 Group[I]->getDebugLoc());
4997 }
4998
4999 // Find the store with minimum alignment to use.
5000 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
5001
5002 // Create unconditional store with selected value and common metadata.
5003 auto *UnpredicatedStore = new VPReplicateRecipe(
5004 StoreWithMinAlign->getUnderlyingInstr(),
5005 {SelectedValue, LastStore->getOperand(1)}, IsSingleScalar,
5006 /*Mask=*/nullptr, *LastStore, CommonMetadata);
5007 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
5008
5009 // Remove all predicated stores from the group.
5010 for (VPReplicateRecipe *Store : Group)
5011 Store->eraseFromParent();
5012 }
5013}
5014
5016 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
5018 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
5019 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
5020
5021 VPValue *TC = Plan.getTripCount();
5022 if (TC->getNumUsers() == 0)
5023 return;
5024
5025  // Skip cases for which the trip count may be non-trivial to materialize,
5026  // i.e., when a scalar tail is absent due to tail folding, or when a scalar
5027  // tail is required.
5028 if (!Plan.hasScalarTail() ||
5029      Plan.getMiddleBlock()->getSingleSuccessor() ==
5030          Plan.getScalarPreheader() ||
5031 !isa<VPIRValue>(TC))
5032 return;
5033
5034  // Materialize vector trip counts for constants early if they can simply
5035  // be computed as (Original TC / (VF * UF)) * (VF * UF).
5036 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
5037 // tail-folded loops.
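  // Worked example (illustrative numbers): for a constant TC = 17 with VF = 4
  // and UF = 2, VFxUF = 8, so the vector trip count folds to
  // (17 udiv 8) * 8 = 16.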
5038 ScalarEvolution &SE = *PSE.getSE();
5039 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
5040 if (!isa<SCEVConstant>(TCScev))
5041 return;
5042 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
5043 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
5044 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
5045 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
5046}
5047
5049 VPBasicBlock *VectorPH) {
5051 if (BTC->getNumUsers() == 0)
5052 return;
5053
5054 VPBuilder Builder(VectorPH, VectorPH->begin());
5055 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5056 auto *TCMO =
5057 Builder.createSub(Plan.getTripCount(), Plan.getConstantInt(TCTy, 1),
5058 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
5059 BTC->replaceAllUsesWith(TCMO);
5060}
5061
5063 if (Plan.hasScalarVFOnly())
5064 return;
5065
5066 VPTypeAnalysis TypeInfo(Plan);
5067 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
5068 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5070 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
5071 vp_depth_first_shallow(LoopRegion->getEntry()));
5072 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
5073 // VPScalarIVStepsRecipe and VPInstructions, excluding ones in replicate
5074  // regions. Those are not materialized explicitly yet; their vector users are
5075  // still handled in VPReplicateRecipe::execute(), via shouldPack().
5076 // TODO: materialize build vectors for replicating recipes in replicating
5077 // regions.
5078 for (VPBasicBlock *VPBB :
5079 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
5080 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5082 continue;
5083 auto *DefR = cast<VPSingleDefRecipe>(&R);
5084 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
5085 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5086 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
5087 };
5088 if ((isa<VPReplicateRecipe>(DefR) &&
5089 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
5090 (isa<VPInstruction>(DefR) &&
5092 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
5093 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
5094 continue;
5095
5096 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
5097      unsigned Opcode = ScalarTy->isStructTy()
5098                            ? VPInstruction::BuildStructVector
5099                            : VPInstruction::BuildVector;
5100      auto *BuildVector = new VPInstruction(Opcode, {DefR});
5101 BuildVector->insertAfter(DefR);
5102
5103 DefR->replaceUsesWithIf(
5104 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
5105 VPUser &U, unsigned) {
5106 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
5107 });
5108 }
5109 }
5110
5111 // Create explicit VPInstructions to convert vectors to scalars. The current
5112 // implementation is conservative - it may miss some cases that may or may not
5113 // be vector values. TODO: introduce Unpacks speculatively - remove them later
5114 // if they are known to operate on scalar values.
5115 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
5116 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
5119 continue;
5120 for (VPValue *Def : R.definedValues()) {
5121 // Skip recipes that are single-scalar or only have their first lane
5122 // used.
5123 // TODO: The Defs skipped here may or may not be vector values.
5124 // Introduce Unpacks, and remove them later, if they are guaranteed to
5125 // produce scalar values.
5127 continue;
5128
5129 // At the moment, we create unpacks only for scalar users outside
5130 // replicate regions. Recipes inside replicate regions still extract the
5131 // required lanes implicitly.
5132 // TODO: Remove once replicate regions are unrolled completely.
5133 auto IsCandidateUnpackUser = [Def](VPUser *U) {
5134 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
5135 return U->usesScalars(Def) &&
5136 (!ParentRegion || !ParentRegion->isReplicator());
5137 };
5138 if (none_of(Def->users(), IsCandidateUnpackUser))
5139 continue;
5140
5141 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
5142 if (R.isPhi())
5143 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
5144 else
5145 Unpack->insertAfter(&R);
5146 Def->replaceUsesWithIf(Unpack,
5147 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
5148 return IsCandidateUnpackUser(&U);
5149 });
5150 }
5151 }
5152 }
5153}
5154
5155void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
5156                                                 VPBasicBlock *VectorPHVPBB,
5157 bool TailByMasking,
5158 bool RequiresScalarEpilogue,
5159 VPValue *Step) {
5160 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
5161 // There's nothing to do if there are no users of the vector trip count or its
5162 // IR value has already been set.
5163 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
5164 return;
5165
5166 VPValue *TC = Plan.getTripCount();
5167 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
5168 VPBasicBlock::iterator InsertPt = VectorPHVPBB->begin();
5169 if (auto *StepR = Step->getDefiningRecipe()) {
5170 assert(StepR->getParent() == VectorPHVPBB &&
5171 "Step must be defined in VectorPHVPBB");
5172 // Insert after Step's definition to maintain valid def-use ordering.
5173 InsertPt = std::next(StepR->getIterator());
5174 }
5175 VPBuilder Builder(VectorPHVPBB, InsertPt);
5176
5177 // If the tail is to be folded by masking, round the number of iterations N
5178 // up to a multiple of Step instead of rounding down. This is done by first
5179 // adding Step-1 and then rounding down. Note that it's ok if this addition
5180 // overflows: the vector induction variable will eventually wrap to zero given
5181 // that it starts at zero and its Step is a power of two; the loop will then
5182 // exit, with the last early-exit vector comparison also producing all-true.
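  // For instance (illustrative numbers), with N = 10 and Step = 4 the
  // rounded-up count is 10 + 3 = 13, and the subsequent N - (N % Step)
  // computation below yields 12, i.e. three masked vector iterations.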
5183 if (TailByMasking) {
5184 TC = Builder.createAdd(
5185 TC, Builder.createSub(Step, Plan.getConstantInt(TCTy, 1)),
5186 DebugLoc::getCompilerGenerated(), "n.rnd.up");
5187 }
5188
5189 // Now we need to generate the expression for the part of the loop that the
5190 // vectorized body will execute. This is equal to N - (N % Step) if scalar
5191 // iterations are not required for correctness, or N - Step, otherwise. Step
5192 // is equal to the vectorization factor (number of SIMD elements) times the
5193 // unroll factor (number of SIMD instructions).
5194 VPValue *R =
5195 Builder.createNaryOp(Instruction::URem, {TC, Step},
5196 DebugLoc::getCompilerGenerated(), "n.mod.vf");
5197
5198 // There are cases where we *must* run at least one iteration in the remainder
5199 // loop. See the cost model for when this can happen. If the step evenly
5200 // divides the trip count, we set the remainder to be equal to the step. If
5201 // the step does not evenly divide the trip count, no adjustment is necessary
5202 // since there will already be scalar iterations. Note that the minimum
5203 // iterations check ensures that N >= Step.
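  // E.g. (illustrative): with N = 16 and Step = 4 the remainder would be 0,
  // so it is bumped to 4 and n.vec becomes 16 - 4 = 12, leaving 4 iterations
  // for the required scalar epilogue.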
5204 if (RequiresScalarEpilogue) {
5205 assert(!TailByMasking &&
5206           "requiring scalar epilogue is not supported with tail folding");
5207 VPValue *IsZero =
5208 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getZero(TCTy));
5209 R = Builder.createSelect(IsZero, Step, R);
5210 }
5211
5212 VPValue *Res =
5213 Builder.createSub(TC, R, DebugLoc::getCompilerGenerated(), "n.vec");
5214 VectorTC.replaceAllUsesWith(Res);
5215}
5216
5218 ElementCount VFEC) {
5219 // If VF and VFxUF have already been materialized (no remaining users),
5220 // there's nothing more to do.
5221 if (Plan.getVF().isMaterialized()) {
5222 assert(Plan.getVFxUF().isMaterialized() &&
5223 "VF and VFxUF must be materialized together");
5224 return;
5225 }
5226
5227 VPBuilder Builder(VectorPH, VectorPH->begin());
5228 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
5229 VPValue &VF = Plan.getVF();
5230 VPValue &VFxUF = Plan.getVFxUF();
5231 // If there are no users of the runtime VF, compute VFxUF by constant folding
5232 // the multiplication of VF and UF.
5233 if (VF.getNumUsers() == 0) {
5234 VPValue *RuntimeVFxUF =
5235 Builder.createElementCount(TCTy, VFEC * Plan.getConcreteUF());
5236 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
5237 return;
5238 }
5239
5240 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
5241 // vscale) * UF.
5242 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
5244 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
5246 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
5247 }
5248 VF.replaceAllUsesWith(RuntimeVF);
5249
5250 VPValue *MulByUF = Builder.createOverflowingOp(
5251 Instruction::Mul,
5252 {RuntimeVF, Plan.getConstantInt(TCTy, Plan.getConcreteUF())},
5253 {true, false});
5254 VFxUF.replaceAllUsesWith(MulByUF);
5255}
5256
5259 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
5260
5261 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
5262 BasicBlock *EntryBB = Entry->getIRBasicBlock();
5263 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
5264 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
5266 continue;
5267 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
5268 if (!ExpSCEV)
5269 break;
5270 const SCEV *Expr = ExpSCEV->getSCEV();
5271 Value *Res =
5272 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
5273 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
5274 VPValue *Exp = Plan.getOrAddLiveIn(Res);
5275 ExpSCEV->replaceAllUsesWith(Exp);
5276 if (Plan.getTripCount() == ExpSCEV)
5277 Plan.resetTripCount(Exp);
5278 ExpSCEV->eraseFromParent();
5279 }
5281 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
5282 "before any VPIRInstructions");
5283 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
5284 // to the VPIRBasicBlock.
5285 auto EI = Entry->begin();
5286 for (Instruction &I : drop_end(*EntryBB)) {
5287 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
5288 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
5289 EI++;
5290 continue;
5291 }
5293 }
5294
5295 return ExpandedSCEVs;
5296}
5297
5298/// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can
5299/// be converted to a narrower recipe. \p V is used by a wide recipe that feeds
5300/// a store interleave group at index \p Idx; \p WideMember0 is the recipe
5301/// feeding the same interleave group at index 0. A VPWidenLoadRecipe can be
5302/// narrowed to an index-independent load if it feeds all wide ops at all
5303/// indices (\p OpV must also be the operand at index \p OpIdx of the lane-0
5304/// recipe \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
5305/// load if \p V is defined at \p Idx of a load interleave group.
5306static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
5307 VPValue *OpV, unsigned Idx, bool IsScalable) {
5308 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
5309 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
5310 if (!Member0OpR)
5311 return Member0Op == OpV;
5312 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
5313 // For scalable VFs, the narrowed plan processes vscale iterations at once,
5314 // so a shared wide load cannot be narrowed to a uniform scalar; bail out.
5315 return !IsScalable && !W->getMask() && W->isConsecutive() &&
5316 Member0Op == OpV;
5317 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
5318 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
5319 return false;
5320}
5321
5322static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
5324 auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
5325 if (!WideMember0)
5326 return false;
5327 for (VPValue *V : Ops) {
5329 return false;
5330 auto *R = cast<VPSingleDefRecipe>(V);
5331 if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
5332 return false;
5333 }
5334
5335 for (unsigned Idx = 0; Idx != WideMember0->getNumOperands(); ++Idx) {
5337 for (VPValue *Op : Ops)
5338 OpsI.push_back(Op->getDefiningRecipe()->getOperand(Idx));
5339
5340 if (canNarrowOps(OpsI, IsScalable))
5341 continue;
5342
5343 if (any_of(enumerate(OpsI), [WideMember0, Idx, IsScalable](const auto &P) {
5344 const auto &[OpIdx, OpV] = P;
5345 return !canNarrowLoad(WideMember0, Idx, OpV, OpIdx, IsScalable);
5346 }))
5347 return false;
5348 }
5349
5350 return true;
5351}
5352
5353/// Returns VF from \p VFs if \p IR is a full interleave group with factor and
5354/// number of members both equal to VF. The interleave group must also access
5355/// the full vector width.
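/// For example (illustrative): with i32 group members and 128-bit vector
/// registers, a full interleave group with factor 4 and 4 members spans
/// 4 * 32 = 128 bits, so VF = 4 would be returned if it is in \p VFs.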
5356static std::optional<ElementCount> isConsecutiveInterleaveGroup(
5358 VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
5359 if (!InterleaveR || InterleaveR->getMask())
5360 return std::nullopt;
5361
5362 Type *GroupElementTy = nullptr;
5363 if (InterleaveR->getStoredValues().empty()) {
5364 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
5365 if (!all_of(InterleaveR->definedValues(),
5366 [&TypeInfo, GroupElementTy](VPValue *Op) {
5367 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5368 }))
5369 return std::nullopt;
5370 } else {
5371 GroupElementTy =
5372 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
5373 if (!all_of(InterleaveR->getStoredValues(),
5374 [&TypeInfo, GroupElementTy](VPValue *Op) {
5375 return TypeInfo.inferScalarType(Op) == GroupElementTy;
5376 }))
5377 return std::nullopt;
5378 }
5379
5380 auto IG = InterleaveR->getInterleaveGroup();
5381 if (IG->getFactor() != IG->getNumMembers())
5382 return std::nullopt;
5383
5384 auto GetVectorBitWidthForVF = [&TTI](ElementCount VF) {
5385 TypeSize Size = TTI.getRegisterBitWidth(
5388 assert(Size.isScalable() == VF.isScalable() &&
5389 "if Size is scalable, VF must be scalable and vice versa");
5390 return Size.getKnownMinValue();
5391 };
5392
5393 for (ElementCount VF : VFs) {
5394 unsigned MinVal = VF.getKnownMinValue();
5395 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * MinVal;
5396 if (IG->getFactor() == MinVal && GroupSize == GetVectorBitWidthForVF(VF))
5397 return {VF};
5398 }
5399 return std::nullopt;
5400}
5401
5402/// Returns true if \p VPV is a narrow VPValue.
5403static bool isAlreadyNarrow(VPValue *VPV) {
5404 if (isa<VPIRValue>(VPV))
5405 return true;
5406 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
5407 return RepR && RepR->isSingleScalar();
5408}
5409
5410// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
5411// a narrow variant.
5412static VPValue *
5414 auto *R = V->getDefiningRecipe();
5415 if (!R || NarrowedOps.contains(V))
5416 return V;
5417
5418 if (isAlreadyNarrow(V))
5419 return V;
5420
5422 auto *WideMember0 = cast<VPSingleDefRecipe>(R);
5423 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
5424 WideMember0->setOperand(
5425 Idx,
5426 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5427 return V;
5428 }
5429
5430 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5431 // Narrow interleave group to wide load, as transformed VPlan will only
5432 // process one original iteration.
5433 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5434 auto *L = new VPWidenLoadRecipe(*LI, LoadGroup->getAddr(),
5435 LoadGroup->getMask(), /*Consecutive=*/true,
5436 {}, LoadGroup->getDebugLoc());
5437 L->insertBefore(LoadGroup);
5438 NarrowedOps.insert(L);
5439 return L;
5440 }
5441
5442 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5443 assert(RepR->isSingleScalar() &&
5444 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5445 "must be a single scalar load");
5446 NarrowedOps.insert(RepR);
5447 return RepR;
5448 }
5449
5450 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5451 VPValue *PtrOp = WideLoad->getAddr();
5452 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5453 PtrOp = VecPtr->getOperand(0);
5454 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5455 // process one original iteration.
5456 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5457 /*IsUniform*/ true,
5458 /*Mask*/ nullptr, {}, *WideLoad);
5459 N->insertBefore(WideLoad);
5460 NarrowedOps.insert(N);
5461 return N;
5462}
5463
5464std::unique_ptr<VPlan>
5466 const TargetTransformInfo &TTI) {
5467 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5468
5469 if (!VectorLoop)
5470 return nullptr;
5471
5472 // Only handle single-block loops for now.
5473 if (VectorLoop->getEntryBasicBlock() != VectorLoop->getExitingBasicBlock())
5474 return nullptr;
5475
5476 // Skip plans when we may not be able to properly narrow.
5477 VPBasicBlock *Exiting = VectorLoop->getExitingBasicBlock();
5478 if (!match(&Exiting->back(), m_BranchOnCount()))
5479 return nullptr;
5480
5481 assert(match(&Exiting->back(),
5483 m_Specific(&Plan.getVectorTripCount()))) &&
5484 "unexpected branch-on-count");
5485
5486 VPTypeAnalysis TypeInfo(Plan);
5488 std::optional<ElementCount> VFToOptimize;
5489 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5491 continue;
5492
5495 continue;
5496
5497 // Bail out on recipes not supported at the moment:
5498 // * phi recipes other than the canonical induction
5499 // * recipes writing to memory except interleave groups
5500 // Only support plans with a canonical induction phi.
5501 if (R.isPhi())
5502 return nullptr;
5503
5504 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5505 if (R.mayWriteToMemory() && !InterleaveR)
5506 return nullptr;
5507
5508 // All other ops are allowed, but we reject uses that cannot be converted
5509 // when checking all allowed consumers (store interleave groups) below.
5510 if (!InterleaveR)
5511 continue;
5512
5513 // Try to find a single VF, where all interleave groups are consecutive and
5514 // saturate the full vector width. If we already have a candidate VF, check
5515 // if it is applicable for the current InterleaveR, otherwise look for a
5516 // suitable VF across the Plan's VFs.
5518 VFToOptimize ? SmallVector<ElementCount>({*VFToOptimize})
5519 : to_vector(Plan.vectorFactors());
5520 std::optional<ElementCount> NarrowedVF =
5521 isConsecutiveInterleaveGroup(InterleaveR, VFs, TypeInfo, TTI);
5522 if (!NarrowedVF || (VFToOptimize && NarrowedVF != VFToOptimize))
5523 return nullptr;
5524 VFToOptimize = NarrowedVF;
5525
5526 // Skip read interleave groups.
5527 if (InterleaveR->getStoredValues().empty())
5528 continue;
5529
5530 // Narrow interleave groups, if all operands are already matching narrow
5531 // ops.
5532 auto *Member0 = InterleaveR->getStoredValues()[0];
5533 if (isAlreadyNarrow(Member0) &&
5534 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5535 StoreGroups.push_back(InterleaveR);
5536 continue;
5537 }
5538
5539 // For now, we only support full interleave groups storing load interleave
5540 // groups.
5541 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5542 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5543 if (!DefR)
5544 return false;
5545 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5546 return IR && IR->getInterleaveGroup()->isFull() &&
5547 IR->getVPValue(Op.index()) == Op.value();
5548 })) {
5549 StoreGroups.push_back(InterleaveR);
5550 continue;
5551 }
5552
5553    // Check if all values feeding InterleaveR are matching wide recipes with
5554    // operands that can be narrowed.
5555 if (!canNarrowOps(InterleaveR->getStoredValues(),
5556 VFToOptimize->isScalable()))
5557 return nullptr;
5558 StoreGroups.push_back(InterleaveR);
5559 }
5560
5561 if (StoreGroups.empty())
5562 return nullptr;
5563
5564 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5565 bool RequiresScalarEpilogue =
5566 MiddleVPBB->getNumSuccessors() == 1 &&
5567 MiddleVPBB->getSingleSuccessor() == Plan.getScalarPreheader();
5568 // Bail out for tail-folding (middle block with a single successor to exit).
5569 if (MiddleVPBB->getNumSuccessors() != 2 && !RequiresScalarEpilogue)
5570 return nullptr;
5571
5572 // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
5573 // original Plan into 2: a) a new clone which contains all VFs of Plan, except
5574 // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
5575 // TODO: Handle cases where only some interleave groups can be narrowed.
5576 std::unique_ptr<VPlan> NewPlan;
5577 if (size(Plan.vectorFactors()) != 1) {
5578 NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
5579 Plan.setVF(*VFToOptimize);
5580 NewPlan->removeVF(*VFToOptimize);
5581 }
5582
5583 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5584 SmallPtrSet<VPValue *, 4> NarrowedOps;
5585 // Narrow operation tree rooted at store groups.
5586 for (auto *StoreGroup : StoreGroups) {
5587 VPValue *Res =
5588 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5589 auto *SI =
5590 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5591 auto *S = new VPWidenStoreRecipe(*SI, StoreGroup->getAddr(), Res, nullptr,
5592 /*Consecutive=*/true, {},
5593 StoreGroup->getDebugLoc());
5594 S->insertBefore(StoreGroup);
5595 StoreGroup->eraseFromParent();
5596 }
5597
5598 // Adjust induction to reflect that the transformed plan only processes one
5599 // original iteration.
5600 auto *CanIV = VectorLoop->getCanonicalIV();
5601 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5602 VPBasicBlock *VectorPH = Plan.getVectorPreheader();
5603 VPBuilder PHBuilder(VectorPH, VectorPH->begin());
5604
5605 VPValue *UF = &Plan.getUF();
5606 VPValue *Step;
5607 if (VFToOptimize->isScalable()) {
5608 VPValue *VScale = PHBuilder.createElementCount(
5610 Step = PHBuilder.createOverflowingOp(Instruction::Mul, {VScale, UF},
5611 {true, false});
5612 Plan.getVF().replaceAllUsesWith(VScale);
5613 } else {
5614 Step = UF;
5616 Plan.getConstantInt(CanIV->getScalarType(), 1));
5617 }
5618 // Materialize vector trip count with the narrowed step.
5619 materializeVectorTripCount(Plan, VectorPH, /*TailByMasking=*/false,
5620 RequiresScalarEpilogue, Step);
5621
5622 Inc->setOperand(1, Step);
5623 Plan.getVFxUF().replaceAllUsesWith(Step);
5624
5625 removeDeadRecipes(Plan);
5626 assert(none_of(*VectorLoop->getEntryBasicBlock(),
5628 "All VPVectorPointerRecipes should have been removed");
5629 return NewPlan;
5630}
5631
5632/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5633/// BranchOnCond recipe.
5635 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5636 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5637 auto *MiddleTerm =
5639 // Only add branch metadata if there is a (conditional) terminator.
5640 if (!MiddleTerm)
5641 return;
5642
5643 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5644 "must have a BranchOnCond");
5645  // Assume that `TripCount % VectorStep` is equally distributed.
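  // For instance (illustrative), VF = 4 and UF = 2 give VectorStep = 8, so the
  // middle-block terminator gets branch weights {1, 7}, treating the remainder
  // TripCount % 8 as uniformly distributed.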
5646 unsigned VectorStep = Plan.getConcreteUF() * VF.getKnownMinValue();
5647 if (VF.isScalable() && VScaleForTuning.has_value())
5648 VectorStep *= *VScaleForTuning;
5649 assert(VectorStep > 0 && "trip count should not be zero");
5650 MDBuilder MDB(Plan.getContext());
5651 MDNode *BranchWeights =
5652 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5653 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5654}
5655
5657 VFRange &Range) {
5658 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5659 auto *MiddleVPBB = Plan.getMiddleBlock();
5660 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5661
5662 auto IsScalableOne = [](ElementCount VF) -> bool {
5663 return VF == ElementCount::getScalable(1);
5664 };
5665
5666 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5667 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5668 if (!FOR)
5669 continue;
5670
5671 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5672 "Cannot handle loops with uncountable early exits");
5673
5674 // This is the second phase of vectorizing first-order recurrences, creating
5675    // extracts for users outside the loop. An overview of the transformation is
5676 // described below. Suppose we have the following loop with some use after
5677 // the loop of the last a[i-1],
5678 //
5679 // for (int i = 0; i < n; ++i) {
5680 // t = a[i - 1];
5681 // b[i] = a[i] - t;
5682 // }
5683 // use t;
5684 //
5685 // There is a first-order recurrence on "a". For this loop, the shorthand
5686 // scalar IR looks like:
5687 //
5688 // scalar.ph:
5689 // s.init = a[-1]
5690 // br scalar.body
5691 //
5692 // scalar.body:
5693 // i = phi [0, scalar.ph], [i+1, scalar.body]
5694 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5695 // s2 = a[i]
5696 // b[i] = s2 - s1
5697 // br cond, scalar.body, exit.block
5698 //
5699 // exit.block:
5700 // use = lcssa.phi [s1, scalar.body]
5701 //
5702    // In this example, s1 is a recurrence because its value depends on the
5703 // previous iteration. In the first phase of vectorization, we created a
5704 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5705 // for users in the scalar preheader and exit block.
5706 //
5707 // vector.ph:
5708 // v_init = vector(..., ..., ..., a[-1])
5709 // br vector.body
5710 //
5711 // vector.body
5712 // i = phi [0, vector.ph], [i+4, vector.body]
5713 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5714 // v2 = a[i, i+1, i+2, i+3]
5715 // b[i] = v2 - v1
5716 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5717 // b[i, i+1, i+2, i+3] = v2 - v1
5718 // br cond, vector.body, middle.block
5719 //
5720 // middle.block:
5721 // vector.recur.extract.for.phi = v2(2)
5722 // vector.recur.extract = v2(3)
5723 // br cond, scalar.ph, exit.block
5724 //
5725 // scalar.ph:
5726 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5727 // [s.init, otherwise]
5728 // br scalar.body
5729 //
5730 // scalar.body:
5731 // i = phi [0, scalar.ph], [i+1, scalar.body]
5732 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5733 // s2 = a[i]
5734 // b[i] = s2 - s1
5735 // br cond, scalar.body, exit.block
5736 //
5737 // exit.block:
5738 // lo = lcssa.phi [s1, scalar.body],
5739 // [vector.recur.extract.for.phi, middle.block]
5740 //
5741 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5742 // Extract the penultimate value of the recurrence and use it as operand for
5743 // the VPIRInstruction modeling the phi.
5745 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5747 continue;
5748
5749 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5750 // penultimate value of the recurrence. Instead we rely on the existing
5751 // extract of the last element from the result of
5752 // VPInstruction::FirstOrderRecurrenceSplice.
5753 // TODO: Consider vscale_range info and UF.
5755 Range))
5756 return;
5757 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5758 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5759 "vector.recur.extract.for.phi");
5760 for (VPUser *U : to_vector(cast<VPInstruction>(&R)->users())) {
5761 auto *ExitPhi = dyn_cast<VPIRPhi>(U);
5762 if (!ExitPhi)
5763 continue;
5764 ExitPhi->replaceUsesOfWith(cast<VPInstruction>(&R), PenultimateElement);
5765 }
5766 }
5767 }
5768}
5769
5770/// Check if \p V is a binary expression of a widened IV and a loop-invariant
5771/// value. Returns the widened IV if found, nullptr otherwise.
5773 auto *BinOp = dyn_cast<VPWidenRecipe>(V);
5774 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()) ||
5775 Instruction::isIntDivRem(BinOp->getOpcode()))
5776 return nullptr;
5777
5778 VPValue *WidenIVCandidate = BinOp->getOperand(0);
5779 VPValue *InvariantCandidate = BinOp->getOperand(1);
5780 if (!isa<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate))
5781 std::swap(WidenIVCandidate, InvariantCandidate);
5782
5783 if (!InvariantCandidate->isDefinedOutsideLoopRegions())
5784 return nullptr;
5785
5786 return dyn_cast<VPWidenIntOrFpInductionRecipe>(WidenIVCandidate);
5787}
5788
5789/// Create a scalar version of \p BinOp, with its \p WidenIV operand replaced
5790/// by \p ScalarIV, and place it after \p ScalarIV's defining recipe.
5794 BinOp->getNumOperands() == 2 && "BinOp must have 2 operands");
5795 auto *ClonedOp = BinOp->clone();
5796 if (ClonedOp->getOperand(0) == WidenIV) {
5797 ClonedOp->setOperand(0, ScalarIV);
5798 } else {
5799 assert(ClonedOp->getOperand(1) == WidenIV && "one operand must be WideIV");
5800 ClonedOp->setOperand(1, ScalarIV);
5801 }
5802 ClonedOp->insertAfter(ScalarIV->getDefiningRecipe());
5803 return ClonedOp;
5804}
5805
5808 Loop &L) {
5809 ScalarEvolution &SE = *PSE.getSE();
5810 VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion();
5811
5812 // Helper lambda to check if the IV range excludes the sentinel value. Try
5813 // signed first, then unsigned. Return an excluded sentinel if found,
5814 // otherwise return std::nullopt.
5815 auto CheckSentinel = [&SE](const SCEV *IVSCEV,
5816 bool UseMax) -> std::optional<APSInt> {
5817 unsigned BW = IVSCEV->getType()->getScalarSizeInBits();
5818 for (bool Signed : {true, false}) {
5819 APSInt Sentinel = UseMax ? APSInt::getMinValue(BW, /*Unsigned=*/!Signed)
5820 : APSInt::getMaxValue(BW, /*Unsigned=*/!Signed);
5821
5822 ConstantRange IVRange =
5823 Signed ? SE.getSignedRange(IVSCEV) : SE.getUnsignedRange(IVSCEV);
5824 if (!IVRange.contains(Sentinel))
5825 return Sentinel;
5826 }
5827 return std::nullopt;
5828 };
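  // Illustrative example: for an IV whose signed range is known to be
  // [0, 100), the signed sentinel INT_MIN is excluded, so the selected IV
  // values can be reduced with SMax starting from INT_MIN, and comparing the
  // result against the sentinel later distinguishes "condition never true"
  // from a real result.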
5829
5830 VPValue *HeaderMask = vputils::findHeaderMask(Plan);
5831 for (VPRecipeBase &Phi :
5832 make_early_inc_range(VectorLoopRegion->getEntryBasicBlock()->phis())) {
5833 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&Phi);
5835 PhiR->getRecurrenceKind()))
5836 continue;
5837
5838 Type *PhiTy = VPTypeAnalysis(Plan).inferScalarType(PhiR);
5839 if (PhiTy->isPointerTy() || PhiTy->isFloatingPointTy())
5840 continue;
5841
5842 // If there's a header mask, the backedge select will not be the find-last
5843 // select.
5844 VPValue *BackedgeVal = PhiR->getBackedgeValue();
5845 auto *FindLastSelect = cast<VPSingleDefRecipe>(BackedgeVal);
5846 if (HeaderMask &&
5847 !match(BackedgeVal,
5848 m_Select(m_Specific(HeaderMask),
5849 m_VPSingleDefRecipe(FindLastSelect), m_Specific(PhiR))))
5850 llvm_unreachable("expected header mask select");
5851
5852 // Get the find-last expression from the find-last select of the reduction
5853 // phi. The find-last select should be a select between the phi and the
5854 // find-last expression.
5855 VPValue *Cond, *FindLastExpression;
5856 if (!match(FindLastSelect, m_Select(m_VPValue(Cond), m_Specific(PhiR),
5857 m_VPValue(FindLastExpression))) &&
5858 !match(FindLastSelect,
5859 m_Select(m_VPValue(Cond), m_VPValue(FindLastExpression),
5860 m_Specific(PhiR))))
5861 continue;
5862
5863 // Check if FindLastExpression is a simple expression of a widened IV. If
5864 // so, we can track the underlying IV instead and sink the expression.
5865 auto *IVOfExpressionToSink = getExpressionIV(FindLastExpression);
5866 const SCEV *IVSCEV = vputils::getSCEVExprForVPValue(
5867 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression, PSE,
5868 &L);
5869 const SCEV *Step;
5870 if (!match(IVSCEV, m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5871 assert(!match(vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L),
5873 "IVOfExpressionToSink not being an AddRec must imply "
5874 "FindLastExpression not being an AddRec.");
5875 continue;
5876 }
5877
5878 // Determine direction from SCEV step.
5879 if (!SE.isKnownNonZero(Step))
5880 continue;
5881
5882 // Positive step means we need UMax/SMax to find the last IV value, and
5883 // UMin/SMin otherwise.
5884 bool UseMax = SE.isKnownPositive(Step);
5885 std::optional<APSInt> SentinelVal = CheckSentinel(IVSCEV, UseMax);
5886 bool UseSigned = SentinelVal && SentinelVal->isSigned();
5887
5888    // Sinking an expression will disable epilogue vectorization. Only use it
5889    // if FindLastExpression cannot be vectorized via a sentinel. Sinking may
5890    // also prevent vectorizing using a sentinel (e.g., if the expression is a
5891    // multiply or divide by a large constant), which also makes
5892    // sinking undesirable.
5893 if (IVOfExpressionToSink) {
5894 const SCEV *FindLastExpressionSCEV =
5895 vputils::getSCEVExprForVPValue(FindLastExpression, PSE, &L);
5896 if (match(FindLastExpressionSCEV,
5897 m_scev_AffineAddRec(m_SCEV(), m_SCEV(Step)))) {
5898 bool NewUseMax = SE.isKnownPositive(Step);
5899 if (auto NewSentinel =
5900 CheckSentinel(FindLastExpressionSCEV, NewUseMax)) {
5901 // The original expression already has a sentinel, so prefer not
5902 // sinking to keep epilogue vectorization possible.
5903 SentinelVal = *NewSentinel;
5904 UseSigned = NewSentinel->isSigned();
5905 UseMax = NewUseMax;
5906 IVSCEV = FindLastExpressionSCEV;
5907 IVOfExpressionToSink = nullptr;
5908 }
5909 }
5910 }
5911
5912 // If no sentinel was found, fall back to a boolean AnyOf reduction to track
5913 // if the condition was ever true. Requires the IV to not wrap, otherwise we
5914 // cannot use min/max.
5915 if (!SentinelVal) {
5916 auto *AR = cast<SCEVAddRecExpr>(IVSCEV);
5917 if (AR->hasNoSignedWrap())
5918 UseSigned = true;
5919 else if (AR->hasNoUnsignedWrap())
5920 UseSigned = false;
5921 else
5922 continue;
5923 }
5924
5926 BackedgeVal,
5928
5929 VPValue *NewFindLastSelect = BackedgeVal;
5930 VPValue *SelectCond = Cond;
5931 if (!SentinelVal || IVOfExpressionToSink) {
5932 // When we need to create a new select, normalize the condition so that
5933 // PhiR is the last operand and include the header mask if needed.
5934 DebugLoc DL = FindLastSelect->getDefiningRecipe()->getDebugLoc();
5935 VPBuilder LoopBuilder(FindLastSelect->getDefiningRecipe());
5936 if (FindLastSelect->getDefiningRecipe()->getOperand(1) == PhiR)
5937 SelectCond = LoopBuilder.createNot(SelectCond);
5938
5939 // When tail folding, mask the condition with the header mask to prevent
5940 // propagating poison from inactive lanes in the last vector iteration.
5941 if (HeaderMask)
5942 SelectCond = LoopBuilder.createLogicalAnd(HeaderMask, SelectCond);
5943
5944 if (SelectCond != Cond || IVOfExpressionToSink) {
5945 NewFindLastSelect = LoopBuilder.createSelect(
5946 SelectCond,
5947 IVOfExpressionToSink ? IVOfExpressionToSink : FindLastExpression,
5948 PhiR, DL);
5949 }
5950 }
5951
5952 // Create the reduction result in the middle block using sentinel directly.
5953 RecurKind MinMaxKind =
5954 UseMax ? (UseSigned ? RecurKind::SMax : RecurKind::UMax)
5955 : (UseSigned ? RecurKind::SMin : RecurKind::UMin);
5956 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
5957 FastMathFlags());
5958 DebugLoc ExitDL = RdxResult->getDebugLoc();
5959 VPBuilder MiddleBuilder(RdxResult);
5960 VPValue *ReducedIV =
5962 NewFindLastSelect, Flags, ExitDL);
5963
5964 // If IVOfExpressionToSink is an expression to sink, sink it now.
5965 VPValue *VectorRegionExitingVal = ReducedIV;
5966 if (IVOfExpressionToSink)
5967 VectorRegionExitingVal =
5968 cloneBinOpForScalarIV(cast<VPWidenRecipe>(FindLastExpression),
5969 ReducedIV, IVOfExpressionToSink);
5970
5971 VPValue *NewRdxResult;
5972 VPValue *StartVPV = PhiR->getStartValue();
5973 if (SentinelVal) {
5974 // Sentinel-based approach: reduce IVs with min/max, compare against
5975 // sentinel to detect if condition was ever true, select accordingly.
5976 VPValue *Sentinel = Plan.getConstantInt(*SentinelVal);
5977 auto *Cmp = MiddleBuilder.createICmp(CmpInst::ICMP_NE, ReducedIV,
5978 Sentinel, ExitDL);
5979 NewRdxResult = MiddleBuilder.createSelect(Cmp, VectorRegionExitingVal,
5980 StartVPV, ExitDL);
5981 StartVPV = Sentinel;
5982 } else {
5983 // Introduce a boolean AnyOf reduction to track if the condition was ever
5984 // true in the loop. Use it to select the initial start value, if it was
5985 // never true.
5986 auto *AnyOfPhi = new VPReductionPHIRecipe(
5987 /*Phi=*/nullptr, RecurKind::Or, *Plan.getFalse(), *Plan.getFalse(),
5988 RdxUnordered{1}, {}, /*HasUsesOutsideReductionChain=*/false);
5989 AnyOfPhi->insertAfter(PhiR);
5990
5991 VPBuilder LoopBuilder(BackedgeVal->getDefiningRecipe());
5992 VPValue *OrVal = LoopBuilder.createOr(AnyOfPhi, SelectCond);
5993 AnyOfPhi->setOperand(1, OrVal);
5994
5995 NewRdxResult = MiddleBuilder.createAnyOfReduction(
5996 OrVal, VectorRegionExitingVal, StartVPV, ExitDL);
5997
5998 // Initialize the IV reduction phi with the neutral element, not the
5999 // original start value, to ensure correct min/max reduction results.
6000 StartVPV = Plan.getOrAddLiveIn(
6001 getRecurrenceIdentity(MinMaxKind, IVSCEV->getType(), {}));
6002 }
6003 RdxResult->replaceAllUsesWith(NewRdxResult);
6004 RdxResult->eraseFromParent();
6005
6006 auto *NewPhiR = new VPReductionPHIRecipe(
6007 cast<PHINode>(PhiR->getUnderlyingInstr()), RecurKind::FindIV, *StartVPV,
6008 *NewFindLastSelect, RdxUnordered{1}, {},
6009 PhiR->hasUsesOutsideReductionChain());
6010 NewPhiR->insertBefore(PhiR);
6011 PhiR->replaceAllUsesWith(NewPhiR);
6012 PhiR->eraseFromParent();
6013 }
6014}
6015
6016namespace {
6017
6018using ExtendKind = TTI::PartialReductionExtendKind;
6019struct ReductionExtend {
6020 Type *SrcType = nullptr;
6021 ExtendKind Kind = ExtendKind::PR_None;
6022};
6023
6024/// Describes the extends used to compute the extended reduction operand.
6025/// ExtendB is optional. If ExtendB is present, ExtendsUser is a binary
6026/// operation.
6027struct ExtendedReductionOperand {
6028 /// The recipe that consumes the extends.
6029 VPWidenRecipe *ExtendsUser = nullptr;
6030 /// Extend descriptions (inputs to getPartialReductionCost).
6031 ReductionExtend ExtendA, ExtendB;
6032};
6033
6034/// A chain of recipes that form a partial reduction. Matches either
6035/// reduction_bin_op (extended op, accumulator), or
6036/// reduction_bin_op (accumulator, extended op).
6037/// The possible forms of the "extended op" are listed in
6038/// matchExtendedReductionOperand.
6039struct VPPartialReductionChain {
6040 /// The top-level binary operation that forms the reduction to a scalar
6041 /// after the loop body.
6042 VPWidenRecipe *ReductionBinOp = nullptr;
6043 /// The user of the extends that is then reduced.
6044 ExtendedReductionOperand ExtendedOp;
6045 /// The recurrence kind for the entire partial reduction chain.
6046  /// This allows distinguishing between Sub and AddWithSub recurrences when
6047  /// the ReductionBinOp is an Instruction::Sub.
6048 RecurKind RK;
6049 /// The index of the accumulator operand of ReductionBinOp. The extended op
6050 /// is `1 - AccumulatorOpIdx`.
6051 unsigned AccumulatorOpIdx;
6052 unsigned ScaleFactor;
6053};
6054
6055static VPSingleDefRecipe *
6056optimizeExtendsForPartialReduction(VPSingleDefRecipe *Op,
6057 VPTypeAnalysis &TypeInfo) {
6058 // reduce.add(mul(ext(A), C))
6059 // -> reduce.add(mul(ext(A), ext(trunc(C))))
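  // E.g. (illustrative): mul(zext i8 %a to i32, 200) can be rewritten as
  // mul(zext i8 %a to i32, zext i8 200 to i32), since 200 is losslessly
  // truncatable to the unsigned 8-bit range; the check below only performs
  // the rewrite when the constant and narrow type allow it.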
6060 const APInt *Const;
6061 if (match(Op, m_Mul(m_ZExtOrSExt(m_VPValue()), m_APInt(Const)))) {
6062 auto *ExtA = cast<VPWidenCastRecipe>(Op->getOperand(0));
6063 Instruction::CastOps ExtOpc = ExtA->getOpcode();
6064 Type *NarrowTy = TypeInfo.inferScalarType(ExtA->getOperand(0));
6065 if (!Op->hasOneUse() ||
6067 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
6068 return Op;
6069
6070 VPBuilder Builder(Op);
6071 auto *Trunc = Builder.createWidenCast(Instruction::CastOps::Trunc,
6072 Op->getOperand(1), NarrowTy);
6073 Type *WideTy = TypeInfo.inferScalarType(ExtA);
6074 Op->setOperand(1, Builder.createWidenCast(ExtOpc, Trunc, WideTy));
6075 return Op;
6076 }
6077
6078 // reduce.add(abs(sub(ext(A), ext(B))))
6079 // -> reduce.add(ext(absolute-difference(A, B)))
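  // The rewrite below relies on |a - b| == max(a, b) - min(a, b) on the
  // narrow source type; e.g. (illustrative) for unsigned 8-bit a = 3, b = 10:
  // max(10, 3) - min(10, 3) = 7 = |3 - 10|.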
6080 VPValue *X, *Y;
6083 auto *Sub = Op->getOperand(0)->getDefiningRecipe();
6084 auto *Ext = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6085 assert(Ext->getOpcode() ==
6086 cast<VPWidenCastRecipe>(Sub->getOperand(1))->getOpcode() &&
6087 "Expected both the LHS and RHS extends to be the same");
6088 bool IsSigned = Ext->getOpcode() == Instruction::SExt;
6089 VPBuilder Builder(Op);
6090 Type *SrcTy = TypeInfo.inferScalarType(X);
6091 auto *FreezeX = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {X}));
6092 auto *FreezeY = Builder.insert(new VPWidenRecipe(Instruction::Freeze, {Y}));
6093 auto *Max = Builder.insert(
6094 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smax : Intrinsic::umax,
6095 {FreezeX, FreezeY}, SrcTy));
6096 auto *Min = Builder.insert(
6097 new VPWidenIntrinsicRecipe(IsSigned ? Intrinsic::smin : Intrinsic::umin,
6098 {FreezeX, FreezeY}, SrcTy));
6099 auto *AbsDiff =
6100 Builder.insert(new VPWidenRecipe(Instruction::Sub, {Max, Min}));
6101 return Builder.createWidenCast(Instruction::CastOps::ZExt, AbsDiff,
6102 TypeInfo.inferScalarType(Op));
6103 }
6104
6105 // reduce.add(ext(mul(ext(A), ext(B))))
6106 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
6107 // TODO: Support this optimization for float types.
6109 m_ZExtOrSExt(m_VPValue()))))) {
6110 auto *Ext = cast<VPWidenCastRecipe>(Op);
6111 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
6112 auto *MulLHS = cast<VPWidenCastRecipe>(Mul->getOperand(0));
6113 auto *MulRHS = cast<VPWidenCastRecipe>(Mul->getOperand(1));
6114 if (!Mul->hasOneUse() ||
6115 (Ext->getOpcode() != MulLHS->getOpcode() && MulLHS != MulRHS) ||
6116 MulLHS->getOpcode() != MulRHS->getOpcode())
6117 return Op;
6118 VPBuilder Builder(Mul);
6119 Mul->setOperand(0, Builder.createWidenCast(MulLHS->getOpcode(),
6120 MulLHS->getOperand(0),
6121 Ext->getResultType()));
6122 Mul->setOperand(1, MulLHS == MulRHS
6123 ? Mul->getOperand(0)
6124 : Builder.createWidenCast(MulRHS->getOpcode(),
6125 MulRHS->getOperand(0),
6126 Ext->getResultType()));
6127 return Mul;
6128 }
6129
6130 return Op;
6131}
6132
6133// Helper to transform a partial reduction chain into a partial reduction
6134// recipe. Assumes profitability has been checked.
6135static void transformToPartialReduction(const VPPartialReductionChain &Chain,
6136 VPTypeAnalysis &TypeInfo, VPlan &Plan,
6137 VPReductionPHIRecipe *RdxPhi) {
6138 VPWidenRecipe *WidenRecipe = Chain.ReductionBinOp;
6139 assert(WidenRecipe->getNumOperands() == 2 && "Expected binary operation");
6140
6141 VPValue *Accumulator = WidenRecipe->getOperand(Chain.AccumulatorOpIdx);
6142 auto *ExtendedOp = cast<VPSingleDefRecipe>(
6143 WidenRecipe->getOperand(1 - Chain.AccumulatorOpIdx));
6144
6145 // Sub-reductions can be implemented in two ways:
6146 // (1) negate the operand in the vector loop (the default way).
6147 // (2) subtract the reduced value from the init value in the middle block.
6148 // Both ways keep the reduction itself as an 'add' reduction.
6149 //
6150 // The ISD nodes for partial reductions don't support folding the
6151 // sub/negation into its operands because the following is not a valid
6152 // transformation:
6153 // sub(0, mul(ext(a), ext(b)))
6154 // -> mul(ext(a), ext(sub(0, b)))
6155 //
6156 // It's therefore better to choose option (2) such that the partial
6157 // reduction is always positive (starting at '0') and to do a final
6158 // subtract in the middle block.
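  // Illustrative example: for a loop computing acc -= a[i] * b[i], option (2)
  // accumulates p += a[i] * b[i] with a partial reduction starting at 0, and
  // the middle block produces acc = acc.start - p.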
6159 if (WidenRecipe->getOpcode() == Instruction::Sub &&
6160 Chain.RK != RecurKind::Sub) {
6161 VPBuilder Builder(WidenRecipe);
6162 Type *ElemTy = TypeInfo.inferScalarType(ExtendedOp);
6163 auto *Zero = Plan.getZero(ElemTy);
6164 auto *NegRecipe =
6165 new VPWidenRecipe(Instruction::Sub, {Zero, ExtendedOp}, VPIRFlags(),
6167 Builder.insert(NegRecipe);
6168 ExtendedOp = NegRecipe;
6169 }
6170
6171 // FIXME: Do these transforms before invoking the cost-model.
6172 ExtendedOp = optimizeExtendsForPartialReduction(ExtendedOp, TypeInfo);
6173
6174  // Check if WidenRecipe is the final result of the reduction. If so, look
6175 // through selects for predicated reductions.
6176 VPValue *Cond = nullptr;
6178 WidenRecipe,
6179 m_Select(m_VPValue(Cond), m_Specific(WidenRecipe), m_Specific(RdxPhi))));
6180 bool IsLastInChain = RdxPhi->getBackedgeValue() == WidenRecipe ||
6181 RdxPhi->getBackedgeValue() == ExitValue;
6182 assert((!ExitValue || IsLastInChain) &&
6183 "if we found ExitValue, it must match RdxPhi's backedge value");
6184
6185 Type *PhiType = TypeInfo.inferScalarType(RdxPhi);
6186 RecurKind RdxKind =
6188 auto *PartialRed = new VPReductionRecipe(
6189 RdxKind,
6190 RdxKind == RecurKind::FAdd ? WidenRecipe->getFastMathFlags()
6191 : FastMathFlags(),
6192 WidenRecipe->getUnderlyingInstr(), Accumulator, ExtendedOp, Cond,
6193 RdxUnordered{/*VFScaleFactor=*/Chain.ScaleFactor});
6194 PartialRed->insertBefore(WidenRecipe);
6195
6196 if (Cond)
6197 ExitValue->replaceAllUsesWith(PartialRed);
6198 WidenRecipe->replaceAllUsesWith(PartialRed);
6199
6200 // We only need to update the PHI node once, which is when we find the
6201 // last reduction in the chain.
6202 if (!IsLastInChain)
6203 return;
6204
6205 // Scale the PHI and ReductionStartVector by the VFScaleFactor
6206 assert(RdxPhi->getVFScaleFactor() == 1 && "scale factor must not be set");
6207 RdxPhi->setVFScaleFactor(Chain.ScaleFactor);
6208
6209 auto *StartInst = cast<VPInstruction>(RdxPhi->getStartValue());
6210 assert(StartInst->getOpcode() == VPInstruction::ReductionStartVector);
6211 auto *NewScaleFactor = Plan.getConstantInt(32, Chain.ScaleFactor);
6212 StartInst->setOperand(2, NewScaleFactor);
6213
6214 // If this is the last value in a sub-reduction chain, then update the PHI
6215 // node to start at `0` and update the reduction-result to subtract from
6216 // the PHI's start value.
6217 if (Chain.RK != RecurKind::Sub)
6218 return;
6219
6220 VPValue *OldStartValue = StartInst->getOperand(0);
6221 StartInst->setOperand(0, StartInst->getOperand(1));
6222
6223 // Replace reduction_result by 'sub (startval, reductionresult)'.
6225 assert(RdxResult && "Could not find reduction result");
6226
6227 VPBuilder Builder = VPBuilder::getToInsertAfter(RdxResult);
6228 constexpr unsigned SubOpc = Instruction::BinaryOps::Sub;
6229 VPInstruction *NewResult = Builder.createNaryOp(
6230 SubOpc, {OldStartValue, RdxResult}, VPIRFlags::getDefaultFlags(SubOpc),
6231 RdxPhi->getDebugLoc());
6232 RdxResult->replaceUsesWithIf(
6233 NewResult,
6234 [&NewResult](VPUser &U, unsigned Idx) { return &U != NewResult; });
6235}
6236
6237/// Returns the cost of a link in a partial-reduction chain for a given VF.
6238static InstructionCost
6239getPartialReductionLinkCost(VPCostContext &CostCtx,
6240 const VPPartialReductionChain &Link,
6241 ElementCount VF) {
6242 Type *RdxType = CostCtx.Types.inferScalarType(Link.ReductionBinOp);
6243 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6244 std::optional<unsigned> BinOpc = std::nullopt;
6245 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6246 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6247 BinOpc = ExtendedOp.ExtendsUser->getOpcode();
6248
6249 std::optional<llvm::FastMathFlags> Flags;
6250 if (RdxType->isFloatingPointTy())
6251 Flags = Link.ReductionBinOp->getFastMathFlags();
6252
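// A sub reduction is lowered by negating the extended operand and
// accumulating with an add, so cost its link as a partial add reduction.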
6253 unsigned Opcode = Link.RK == RecurKind::Sub
6254 ? (unsigned)Instruction::Add
6255 : Link.ReductionBinOp->getOpcode();
6256 return CostCtx.TTI.getPartialReductionCost(
6257 Opcode, ExtendedOp.ExtendA.SrcType, ExtendedOp.ExtendB.SrcType, RdxType,
6258 VF, ExtendedOp.ExtendA.Kind, ExtendedOp.ExtendB.Kind, BinOpc,
6259 CostCtx.CostKind, Flags);
6260}
6261
6262static ExtendKind getPartialReductionExtendKind(VPWidenCastRecipe *Cast) {
6264}
6265
6266/// Checks if \p Op (which is an operand of \p UpdateR) is an extended reduction
6267/// operand. This is an operand where the source of the value (e.g. a load) has
6268/// been extended (sext, zext, or fpext) before it is used in the reduction.
6269///
6270/// Possible forms matched by this function:
6271/// - UpdateR(PrevValue, ext(...))
6272/// - UpdateR(PrevValue, BinOp(ext(...), ext(...)))
6273/// - UpdateR(PrevValue, BinOp(ext(...), Constant))
6274/// - UpdateR(PrevValue, neg(BinOp(ext(...), ext(...))))
6275/// - UpdateR(PrevValue, neg(BinOp(ext(...), Constant)))
6276/// - UpdateR(PrevValue, ext(mul(ext(...), ext(...))))
6277/// - UpdateR(PrevValue, ext(mul(ext(...), Constant)))
6278 /// - UpdateR(PrevValue, abs(sub(ext(...), ext(...))))
6279///
6280/// Note: The second operand of UpdateR corresponds to \p Op in the examples.
6281static std::optional<ExtendedReductionOperand>
6282matchExtendedReductionOperand(VPWidenRecipe *UpdateR, VPValue *Op,
6283 VPTypeAnalysis &TypeInfo) {
6284 assert(is_contained(UpdateR->operands(), Op) &&
6285 "Op should be operand of UpdateR");
6286
6287 // Try matching an absolute difference operand of the form
6288 // `abs(sub(ext(A), ext(B)))`. This will be later transformed into
6289 // `ext(absolute-difference(A, B))`. This allows us to perform the absolute
6290 // difference on a wider type and get the extend for "free" from the partial
6291 // reduction.
6292 VPValue *X, *Y;
6293 if (Op->hasOneUse() &&
6297 auto *Abs = cast<VPWidenIntrinsicRecipe>(Op);
6298 auto *Sub = cast<VPWidenRecipe>(Abs->getOperand(0));
6299 auto *LHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(0));
6300 auto *RHSExt = cast<VPWidenCastRecipe>(Sub->getOperand(1));
6301 Type *LHSInputType = TypeInfo.inferScalarType(X);
6302 Type *RHSInputType = TypeInfo.inferScalarType(Y);
6303 if (LHSInputType != RHSInputType ||
6304 LHSExt->getOpcode() != RHSExt->getOpcode())
6305 return std::nullopt;
6306 // Note: This is essentially the same as matching ext(...) as we will
6307 // rewrite this operand to ext(absolute-difference(A, B)).
6308 return ExtendedReductionOperand{
6309 Sub,
6310 /*ExtendA=*/{LHSInputType, getPartialReductionExtendKind(LHSExt)},
6311 /*ExtendB=*/{}};
6312 }
6313
6314 std::optional<TTI::PartialReductionExtendKind> OuterExtKind;
6316 auto *CastRecipe = cast<VPWidenCastRecipe>(Op);
6317 VPValue *CastSource = CastRecipe->getOperand(0);
6318 OuterExtKind = getPartialReductionExtendKind(CastRecipe);
6319 if (match(CastSource, m_Mul(m_VPValue(), m_VPValue())) ||
6320 match(CastSource, m_FMul(m_VPValue(), m_VPValue()))) {
6321 // Match: ext(mul(...))
6322 // Record the outer extend kind and set `Op` to the mul. We can then match
6323 // this as a binary operation. Note: We can optimize out the outer extend
6324 // by widening the inner extends to match it. See
6325 // optimizeExtendsForPartialReduction.
6326 Op = CastSource;
6327 } else if (UpdateR->getOpcode() == Instruction::Add ||
6328 UpdateR->getOpcode() == Instruction::FAdd) {
6329 // Match: UpdateR(PrevValue, ext(...))
6330 // TODO: Remove the add/fadd restriction (we should be able to handle this
6331 // case for sub reductions too).
6332 return ExtendedReductionOperand{
6333 UpdateR,
6334 /*ExtendA=*/{TypeInfo.inferScalarType(CastSource), *OuterExtKind},
6335 /*ExtendB=*/{}};
6336 }
6337 }
6338
6339 if (!Op->hasOneUse())
6340 return std::nullopt;
6341
6342 // Handle neg(...) pattern (aka sub(0, ...)).
6343 VPValue *NegatedOp = nullptr;
6344 if (match(Op, m_Sub(m_ZeroInt(), m_VPValue(NegatedOp))))
6345 Op = NegatedOp;
6346
6348 if (!BinOp || !Instruction::isBinaryOp(BinOp->getOpcode()))
6349 return std::nullopt;
6350
6351 // The rest of the matching assumes `Op` is a (possibly extended/negated)
6352 // binary operation.
6353
6354 VPValue *LHS = BinOp->getOperand(0);
6355 VPValue *RHS = BinOp->getOperand(1);
6356
6357 // The LHS of the operation must always be an extend.
6359 return std::nullopt;
6360
6361 auto *LHSCast = cast<VPWidenCastRecipe>(LHS);
6362 Type *LHSInputType = TypeInfo.inferScalarType(LHSCast->getOperand(0));
6363 ExtendKind LHSExtendKind = getPartialReductionExtendKind(LHSCast);
6364
6365 // The RHS of the operation can be an extend or a constant integer.
6366 const APInt *RHSConst = nullptr;
6367 VPWidenCastRecipe *RHSCast = nullptr;
6369 RHSCast = cast<VPWidenCastRecipe>(RHS);
6370 else if (!match(RHS, m_APInt(RHSConst)) ||
6371 !canConstantBeExtended(RHSConst, LHSInputType, LHSExtendKind))
6372 return std::nullopt;
6373
6374 // The outer extend kind must match the inner extends for folding.
6375 for (VPWidenCastRecipe *Cast : {LHSCast, RHSCast})
6376 if (Cast && OuterExtKind &&
6377 getPartialReductionExtendKind(Cast) != OuterExtKind)
6378 return std::nullopt;
6379
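// For a constant RHS there is no cast recipe; reuse the LHS extend's source
// type and kind, since canConstantBeExtended verified the constant fits.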
6380 Type *RHSInputType = LHSInputType;
6381 ExtendKind RHSExtendKind = LHSExtendKind;
6382 if (RHSCast) {
6383 RHSInputType = TypeInfo.inferScalarType(RHSCast->getOperand(0));
6384 RHSExtendKind = getPartialReductionExtendKind(RHSCast);
6385 }
6386
6387 return ExtendedReductionOperand{
6388 BinOp, {LHSInputType, LHSExtendKind}, {RHSInputType, RHSExtendKind}};
6389}
6390
6391/// Examines each operation in the reduction chain corresponding to \p RedPhiR,
6392/// and determines if the target can use a cheaper operation with a wider
6393/// per-iteration input VF and narrower PHI VF. If successful, returns the chain
6394/// of operations in the reduction.
6395static std::optional<SmallVector<VPPartialReductionChain>>
6396getScaledReductions(VPReductionPHIRecipe *RedPhiR, VPCostContext &CostCtx,
6397 VFRange &Range) {
6398 // Get the backedge value from the reduction PHI and find the
6399 // ComputeReductionResult that uses it (directly or through a select for
6400 // predicated reductions).
6401 auto *RdxResult = vputils::findComputeReductionResult(RedPhiR);
6402 if (!RdxResult)
6403 return std::nullopt;
6404 VPValue *ExitValue = RdxResult->getOperand(0);
6405 match(ExitValue, m_Select(m_VPValue(), m_VPValue(ExitValue), m_VPValue()));
6406
6407 VPTypeAnalysis &TypeInfo = CostCtx.Types;
6409 RecurKind RK = RedPhiR->getRecurrenceKind();
6410 Type *PhiType = TypeInfo.inferScalarType(RedPhiR);
6411 TypeSize PHISize = PhiType->getPrimitiveSizeInBits();
6412
6413 // Work backwards from the ExitValue examining each reduction operation.
6414 VPValue *CurrentValue = ExitValue;
6415 while (CurrentValue != RedPhiR) {
6416 auto *UpdateR = dyn_cast<VPWidenRecipe>(CurrentValue);
6417 if (!UpdateR || !Instruction::isBinaryOp(UpdateR->getOpcode()))
6418 return std::nullopt;
6419
6420 VPValue *Op = UpdateR->getOperand(1);
6421 VPValue *PrevValue = UpdateR->getOperand(0);
6422
6423 // Find the extended operand. The other operand (PrevValue) is the next link
6424 // in the reduction chain.
6425 std::optional<ExtendedReductionOperand> ExtendedOp =
6426 matchExtendedReductionOperand(UpdateR, Op, TypeInfo);
6427 if (!ExtendedOp) {
6428 ExtendedOp = matchExtendedReductionOperand(UpdateR, PrevValue, TypeInfo);
6429 if (!ExtendedOp)
6430 return std::nullopt;
6431 std::swap(Op, PrevValue);
6432 }
6433
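// The VF scale factor is the number of narrow source elements that fit in
// one element of the reduction PHI's type (PHI width / extend source width).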
6434 Type *ExtSrcType = ExtendedOp->ExtendA.SrcType;
6435 TypeSize ExtSrcSize = ExtSrcType->getPrimitiveSizeInBits();
6436 if (!PHISize.hasKnownScalarFactor(ExtSrcSize))
6437 return std::nullopt;
6438
6439 // Record this link of the chain. Whether the target supports it for the
6440 // given VF range (i.e. the partial-reduction cost is valid) is checked
6441 // later, when the range is clamped based on profitability.
6442 VPPartialReductionChain Link(
6443 {UpdateR, *ExtendedOp, RK,
6444 PrevValue == UpdateR->getOperand(0) ? 0U : 1U,
6445 static_cast<unsigned>(PHISize.getKnownScalarFactor(ExtSrcSize))});
6446 Chain.push_back(Link);
6447 CurrentValue = PrevValue;
6448 }
6449
6450 // The chain links were collected by traversing backwards from the exit value.
6451 // Reverse the chain so its links are in program order.
6452 std::reverse(Chain.begin(), Chain.end());
6453 return Chain;
6454}
6455} // namespace
6456
6458 VPCostContext &CostCtx,
6459 VFRange &Range) {
6460 // Find all possible valid partial reductions, grouping chains by their PHI.
6461 // This grouping allows invalidating the whole chain, if any link is not a
6462 // valid partial reduction.
6464 ChainsByPhi;
6465 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
6466 for (VPRecipeBase &R : HeaderVPBB->phis()) {
6467 auto *RedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
6468 if (!RedPhiR)
6469 continue;
6470
6471 if (auto Chains = getScaledReductions(RedPhiR, CostCtx, Range))
6472 ChainsByPhi.try_emplace(RedPhiR, std::move(*Chains));
6473 }
6474
6475 if (ChainsByPhi.empty())
6476 return;
6477
6478 // Build set of partial reduction operations for extend user validation and
6479 // a map of reduction bin ops to their scale factors for scale validation.
6480 SmallPtrSet<VPRecipeBase *, 4> PartialReductionOps;
6481 DenseMap<VPSingleDefRecipe *, unsigned> ScaledReductionMap;
6482 for (const auto &[_, Chains] : ChainsByPhi)
6483 for (const VPPartialReductionChain &Chain : Chains) {
6484 PartialReductionOps.insert(Chain.ExtendedOp.ExtendsUser);
6485 ScaledReductionMap[Chain.ReductionBinOp] = Chain.ScaleFactor;
6486 }
6487
6488 // A partial reduction is invalid if any of its extends are used by
6489 // something that isn't another partial reduction. This is because the
6490 // extends are intended to be lowered along with the reduction itself.
6491 auto ExtendUsersValid = [&](VPValue *Ext) {
6492 return !isa<VPWidenCastRecipe>(Ext) || all_of(Ext->users(), [&](VPUser *U) {
6493 return PartialReductionOps.contains(cast<VPRecipeBase>(U));
6494 });
6495 };
6496
6497 auto IsProfitablePartialReductionChainForVF =
6498 [&](ArrayRef<VPPartialReductionChain> Chain, ElementCount VF) -> bool {
6499 InstructionCost PartialCost = 0, RegularCost = 0;
6500
6501 // The chain is a profitable partial reduction chain if the cost of handling
6502 // the entire chain is cheaper when using partial reductions than when
6503 // handling the entire chain using regular reductions.
6504 for (const VPPartialReductionChain &Link : Chain) {
6505 const ExtendedReductionOperand &ExtendedOp = Link.ExtendedOp;
6506 InstructionCost LinkCost = getPartialReductionLinkCost(CostCtx, Link, VF);
6507 if (!LinkCost.isValid())
6508 return false;
6509
6510 PartialCost += LinkCost;
6511 RegularCost += Link.ReductionBinOp->computeCost(VF, CostCtx);
6512 // If ExtendB is not none, then the "ExtendsUser" is the binary operation.
6513 if (ExtendedOp.ExtendB.Kind != ExtendKind::PR_None)
6514 RegularCost += ExtendedOp.ExtendsUser->computeCost(VF, CostCtx);
6515 for (VPValue *Op : ExtendedOp.ExtendsUser->operands())
6516 if (auto *Extend = dyn_cast<VPWidenCastRecipe>(Op))
6517 RegularCost += Extend->computeCost(VF, CostCtx);
6518 }
6519 return PartialCost.isValid() && PartialCost <= RegularCost;
6520 };
6521
6522 // Validate chains: check that extends are only used by partial reductions,
6523 // and that reduction bin ops are only used by other partial reductions with
6524 // matching scale factors, by recipes outside the loop region, or by the
6525 // select introduced by tail-folding. Otherwise we would create users of
6526 // scaled reductions where the types of the other operands don't match.
6527 for (auto &[RedPhiR, Chains] : ChainsByPhi) {
6528 for (const VPPartialReductionChain &Chain : Chains) {
6529 if (!all_of(Chain.ExtendedOp.ExtendsUser->operands(), ExtendUsersValid)) {
6530 Chains.clear();
6531 break;
6532 }
6533 auto UseIsValid = [&, RedPhiR = RedPhiR](VPUser *U) {
6534 if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(U))
6535 return PhiR == RedPhiR;
6536 auto *R = cast<VPSingleDefRecipe>(U);
6537 return Chain.ScaleFactor == ScaledReductionMap.lookup_or(R, 0) ||
6539 m_Specific(Chain.ReductionBinOp))) ||
6540 match(R, m_Select(m_VPValue(), m_Specific(Chain.ReductionBinOp),
6541 m_Specific(RedPhiR)));
6542 };
6543 if (!all_of(Chain.ReductionBinOp->users(), UseIsValid)) {
6544 Chains.clear();
6545 break;
6546 }
6547
6548 // Check if the compute-reduction-result is used by a sunk store.
6549 // TODO: Also form partial reductions in those cases.
6550 if (auto *RdxResult = vputils::findComputeReductionResult(RedPhiR)) {
6551 if (any_of(RdxResult->users(), [](VPUser *U) {
6552 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
6553 return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
6554 })) {
6555 Chains.clear();
6556 break;
6557 }
6558 }
6559 }
6560
6561 // Clear the chain if it is not profitable.
6563 [&, &Chains = Chains](ElementCount VF) {
6564 return IsProfitablePartialReductionChainForVF(Chains, VF);
6565 },
6566 Range))
6567 Chains.clear();
6568 }
6569
6570 for (auto &[Phi, Chains] : ChainsByPhi)
6571 for (const VPPartialReductionChain &Chain : Chains)
6572 transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
6573}
6574
6576 VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
6577 // Collect all loads/stores first. We will handle the ones with simpler
6578 // decisions first, followed by more complex ones whose decisions are
6579 // potentially guided by or dependent on the simpler ones.
6581 for (VPBasicBlock *VPBB :
6584 for (VPRecipeBase &R : *VPBB) {
6585 auto *VPI = dyn_cast<VPInstruction>(&R);
6586 if (VPI && VPI->getUnderlyingValue() &&
6587 is_contained({Instruction::Load, Instruction::Store},
6588 VPI->getOpcode()))
6589 MemOps.push_back(VPI);
6590 }
6591 }
6592
6593 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
6594 VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
6595
6596 for (VPInstruction *VPI : MemOps) {
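// Helper that replaces the abstract load/store VPInstruction with a concrete
// recipe: record the mapping, insert the new recipe in place, forward uses
// for loads, and erase the original.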
6597 auto ReplaceWith = [&](VPRecipeBase *New) {
6598 RecipeBuilder.setRecipe(cast<Instruction>(VPI->getUnderlyingValue()),
6599 New);
6600 New->insertBefore(VPI);
6601 if (VPI->getOpcode() == Instruction::Load)
6602 VPI->replaceAllUsesWith(New->getVPSingleValue());
6603 VPI->eraseFromParent();
6604 };
6605
6606 // Note: we must do this for the scalar VPlan as well.
6607 if (RecipeBuilder.replaceWithFinalIfReductionStore(VPI,
6608 FinalRedStoresBuilder))
6609 continue;
6610
6611 // Filter out scalar VPlan for the remaining memory operations.
6613 [](ElementCount VF) { return VF.isScalar(); }, Range))
6614 continue;
6615
6616 if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
6617 ReplaceWith(Histogram);
6618 continue;
6619 }
6620
6621 VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
6622 if (!Recipe)
6623 Recipe = RecipeBuilder.handleReplication(VPI, Range);
6624
6625 ReplaceWith(Recipe);
6626 }
6627}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
#define _
iv Induction Variable Users
Definition IVUsers.cpp:48
iv users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static VPValue * cloneBinOpForScalarIV(VPWidenRecipe *BinOp, VPValue *ScalarIV, VPWidenIntOrFpInductionRecipe *WidenIV)
Create a scalar version of BinOp, with its WidenIV operand replaced by ScalarIV, and place it after S...
static VPWidenIntOrFpInductionRecipe * getExpressionIV(VPValue *V)
Check if V is a binary expression of a widened IV and a loop-invariant value.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx, bool IsScalable)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static std::optional< ElementCount > isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ArrayRef< ElementCount > VFs, VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI)
Returns VF from VFs if IR is a full interleave group with factor and number of members both equal to ...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectGroupedReplicateMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L, function_ref< bool(VPReplicateRecipe *)> FilterFn)
Collect either replicated Loads or Stores grouped by their address SCEV, in a deep-traversal of the v...
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static void reassociateHeaderMask(VPlan &Plan)
Reassociate (headermask && x) && y -> headermask && (x && y) to allow the header mask to be simplifie...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations using scoped noalias metadata,...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool canNarrowOps(ArrayRef< VPValue * > Ops, bool IsScalable)
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1043
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1016
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getMinValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the minimum integer value with the given bit width and signedness.
Definition APSInt.h:310
static APSInt getMaxValue(uint32_t numBits, bool Unsigned)
Return the APSInt representing the maximum integer value with the given bit width and signedness.
Definition APSInt.h:302
@ NoAlias
The two locations do not alias at all.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
This class represents a range of values.
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
ValueT lookup_or(const_arg_type_t< KeyT > Val, U &&Default) const
Definition DenseMap.h:215
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1633
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
bool empty() const
Definition MapVector.h:77
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
Post-order traversal of a graph.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(SCEVUse SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getUDivExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI bool isKnownPositive(const SCEV *S)
Test if the given expression is known to be positive.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
ConstantRange getUnsignedRange(const SCEV *S)
Determine the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
static LLVM_ABI AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3866
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4230
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4305
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4257
iterator end()
Definition VPlan.h:4267
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4265
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4318
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:598
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
const VPRecipeBase & front() const
Definition VPlan.h:4277
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:644
const VPRecipeBase & back() const
Definition VPlan.h:4279
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2793
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2829
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2819
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2835
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2815
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:98
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:319
VPRegionBlock * getParent()
Definition VPlan.h:190
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
size_t getNumSuccessors() const
Definition VPlan.h:241
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:310
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:226
VPlan * getPlan()
Definition VPlan.cpp:177
const std::string & getName() const
Definition VPlan.h:181
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:329
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:237
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:283
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:231
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:215
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:299
static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop latch, using isHeader().
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:200
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:218
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:236
static auto blocksOnly(T &&Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:272
static void transferSuccessors(VPBlockBase *Old, VPBlockBase *New)
Transfer successors from Old to New. New must have no successors.
Definition VPlanUtils.h:256
static SmallVector< VPBasicBlock * > blocksInSingleSuccessorChainBetween(VPBasicBlock *FirstBB, VPBasicBlock *LastBB)
Returns the blocks between FirstBB and LastBB, where FirstBB to LastBB forms a single-sucessor chain.
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3290
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPInstruction * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createNot(VPValue *Operand, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createAnyOfReduction(VPValue *ChainOp, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown())
Create an AnyOf reduction pattern: or-reduce ChainOp, freeze the result, then select between TrueVal ...
Definition VPlan.cpp:1618
VPInstruction * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPInstruction * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3808
A recipe for generating the phi node tracking the current scalar iteration index.
Definition VPlan.h:3898
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:465
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:438
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:450
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:460
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3978
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3335
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2305
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2347
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2336
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:2047
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4383
Class to record and manage LLVM IR flags.
Definition VPlan.h:690
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1170
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1225
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1326
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1320
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2931
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2923
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2952
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:3004
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2962
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1592
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3477
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:406
VPRegionBlock * getRegion()
Definition VPlan.h:4535
VPBasicBlock * getParent()
Definition VPlan.h:481
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPHistogramRecipe * widenIfHistogram(VPInstruction *VPI)
If VPI represents a histogram operation (as determined by LoopVectorizationLegality) make that safe f...
void setRecipe(Instruction *I, VPRecipeBase *R)
Set the recipe created for given ingredient.
VPRecipeBase * tryToWidenMemory(VPInstruction *VPI, VFRange &Range)
Check if the load or store instruction VPI should widened for Range.Start and potentially masked.
bool replaceWithFinalIfReductionStore(VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder)
If VPI is a store of a reduction into an invariant address, delete it.
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicationRecipe for VPI.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:3164
A recipe for handling reduction phis.
Definition VPlan.h:2699
void setVFScaleFactor(unsigned ScaleFactor)
Set the VFScaleFactor for this reduction phi.
Definition VPlan.h:2746
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2739
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2757
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:3055
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4418
const VPBlockBase * getEntry() const
Definition VPlan.h:4454
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4529
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4486
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4471
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4516
const VPBlockBase * getExiting() const
Definition VPlan.h:4466
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4479
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3209
bool isSingleScalar() const
Definition VPlan.h:3250
bool isPredicated() const
Definition VPlan.h:3252
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3274
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:4050
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:607
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:675
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:296
operand_range operands()
Definition VPlanValue.h:364
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:340
unsigned getNumOperands() const
Definition VPlanValue.h:334
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:335
void addOperand(VPValue *Operand)
Definition VPlanValue.h:329
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1446
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:70
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:196
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1449
unsigned getNumUsers() const
Definition VPlanValue.h:107
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1455
user_range users()
Definition VPlanValue.h:149
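An illustrative sketch of rewiring the def-use graph with the use-list API above; OldV, NewV and the predicate are placeholders, not code from this file.
// Sketch only: replace uses of OldV with NewV when the user has >1 operand.
static void exampleRewire(VPValue *OldV, VPValue *NewV) {
  OldV->replaceUsesWithIf(NewV, [](VPUser &U, unsigned) {
    return U.getNumOperands() > 1;      // illustrative condition only
  });
  // OldV->replaceAllUsesWith(NewV) would rewire every remaining use instead.
}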
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2153
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3941
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1839
Instruction::CastOps getOpcode() const
Definition VPlan.h:1877
A recipe for handling GEP instructions.
Definition VPlan.h:2089
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2371
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2399
PHINode * getPHINode() const
Returns the underlying PHINode if one exists, or null otherwise.
Definition VPlan.h:2417
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2402
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2422
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2453
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2500
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2504
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2531
A recipe for widening vector intrinsics.
Definition VPlan.h:1891
A common base class for widening memory operations.
Definition VPlan.h:3520
A recipe for widened phis.
Definition VPlan.h:2589
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1783
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
VPWidenRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1803
unsigned getOpcode() const
Definition VPlan.h:1820
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4548
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4864
bool hasVF(ElementCount VF) const
Definition VPlan.h:4762
const DataLayout & getDataLayout() const
Definition VPlan.h:4744
LLVMContext & getContext() const
Definition VPlan.h:4740
VPBasicBlock * getEntry()
Definition VPlan.h:4640
bool hasScalableVF() const
Definition VPlan.h:4763
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4699
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4720
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4769
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4835
VPSymbolicValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4738
VPIRValue * getAllOnesValue(Type *Ty)
Return a VPIRValue wrapping the AllOnes value of type Ty.
Definition VPlan.h:4841
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4911
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4867
bool hasUF(unsigned UF) const
Definition VPlan.h:4787
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4689
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4728
VPValue * getBackedgeTakenCount() const
Definition VPlan.h:4725
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4812
VPIRValue * getZero(Type *Ty)
Return a VPIRValue wrapping the null value of type Ty.
Definition VPlan.h:4838
void setVF(ElementCount VF)
Definition VPlan.h:4750
bool isUnrolled() const
Returns true if the VPlan has already been unrolled, i.e.
Definition VPlan.h:4803
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1067
unsigned getConcreteUF() const
Returns the concrete UF of the plan, after unrolling.
Definition VPlan.h:4790
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4713
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4665
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4890
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4832
VPSymbolicValue & getUF()
Returns the UF of the vector loop region.
Definition VPlan.h:4735
bool hasScalarVFOnly() const
Definition VPlan.h:4780
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4679
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4645
VPSymbolicValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4731
void setUF(unsigned UF)
Definition VPlan.h:4795
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop, i.e.
Definition VPlan.h:4943
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1215
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4846
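A hedged sketch of how these VPlan accessors are typically combined inside a VPlan-to-VPlan transform; the function and its body are illustrative and not part of this file.
// Sketch only: gather the pieces most transforms start from.
static void exampleTransform(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  if (!LoopRegion)
    return;                                    // regions may already be dissolved
  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
  VPValue *TripCount = Plan.getTripCount();
  VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();
  // ... create or rewrite recipes in VectorPH / the loop region here ...
  (void)VectorPH; (void)TripCount; (void)CanIV;
}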
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:261
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
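A quick illustration of these FixedOrScalableQuantity helpers as used through ElementCount; the values are made up.
// Sketch only.
static void exampleElementCounts() {
  ElementCount VF = ElementCount::getScalable(4);     // <vscale x 4 x ...>
  unsigned MinLanes = VF.getKnownMinValue();          // 4
  ElementCount VF2 = VF.multiplyCoefficientBy(2);     // <vscale x 8 x ...>
  bool Less = ElementCount::isKnownLT(VF, VF2);       // true, 4 < 8 at any vscale
  (void)MinLanes; (void)Less;
}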
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A divided by B using unsigned division, rounded according to the given rounding mode.
Definition APInt.cpp:2803
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
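A short sketch of the IR-level matchers shown above (V is some llvm::Value *; not code from this file).
// Sketch only: bind the operand of a single-use @llvm.fabs call.
static bool exampleMatchFabs(Value *V, Value *&X) {
  using namespace llvm::PatternMatch;
  return match(V, m_OneUse(m_Intrinsic<Intrinsic::fabs>(m_Value(X))));
}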
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::FAdd, true > m_c_FAdd(const LHS &L, const RHS &R)
Matches FAdd with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
auto m_WidenAnyExtend(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
auto m_VPPhi(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< VPSingleDefRecipe > m_VPSingleDefRecipe(VPSingleDefRecipe *&V)
Match a VPSingleDefRecipe, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
auto m_WidenIntrinsic(const T &...Ops)
VPInstruction_match< VPInstruction::ExitingIVValue, Op0_t > m_ExitingIVValue(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
auto m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fixed.
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
auto m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
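A sketch of matching on the VPlan def-use graph with the matchers above; Def is some VPValue and the bound values A and B are illustrative.
// Sketch only: recognize Def as an OR of two values, in either operand order.
static bool exampleMatchOr(VPValue *Def, VPValue *&A, VPValue *&B) {
  using namespace VPlanPatternMatch;
  return match(Def, m_c_BinaryOr(m_VPValue(A), m_VPValue(B)));
}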
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPInstruction * findComputeReductionResult(VPReductionPHIRecipe *PhiR)
Find the ComputeReductionResult recipe for PhiR, looking through selects inserted for predicated redu...
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
static VPRecipeBase * findUserOf(VPValue *V, const MatchT &P)
If V is used by a recipe matching pattern P, return it.
Definition VPlanUtils.h:132
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
Definition STLExtras.h:2180
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
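For example, a range-based all_of over a recipe's operands (illustrative only).
// Sketch only: check that every operand of R is defined outside loop regions.
static bool exampleAllOperandsOutside(VPRecipeBase &R) {
  return all_of(R.operands(), [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
}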
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
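A sketch of the usual erase-while-iterating pattern over a VPBasicBlock; the removal condition is purely illustrative.
// Sketch only: make_early_inc_range advances the iterator before the body
// runs, so the current recipe can be erased safely.
static void exampleEraseTrivialRecipes(VPBasicBlock &VPBB) {
  for (VPRecipeBase &R : make_early_inc_range(VPBB)) {
    if (R.getNumOperands() == 0)        // illustrative condition only
      R.eraseFromParent();
  }
}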
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:253
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:279
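A sketch of walking a plan's CFG; the deep traversal descends into region blocks, while the shallow one stops at region boundaries (illustrative only).
// Sketch only: visit every VPBasicBlock reachable from the plan entry.
static void exampleVisitBlocks(VPlan &Plan) {
  for (VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry())) {
    if (auto *VPBB = dyn_cast<VPBasicBlock>(Block)) {
      // visit the recipes of VPBB here ...
      (void)VPBB;
    }
  }
}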
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1152
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
UncountableExitStyle
Different methods of handling early exits.
Definition VPlan.h:83
@ ReadOnly
No side effects to worry about, so we can process any uncountable exits in the loop and branch either...
Definition VPlan.h:88
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:552
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1827
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
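For instance, the neutral value of an integer add reduction (Ctx is some LLVMContext; illustrative only).
// Sketch only: the identity for RecurKind::Add over i32 is the constant 0.
static Value *exampleAddIdentity(LLVMContext &Ctx) {
  Type *Int32Ty = Type::getInt32Ty(Ctx);
  return getRecurrenceIdentity(RecurKind::Add, Int32Ty, FastMathFlags());
}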
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This reduction is unordered with the partial result scaled down by some factor.
Definition VPlan.h:2681
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
TargetTransformInfo::TargetCostKind CostKind
VPTypeAnalysis Types
const TargetTransformInfo & TTI
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2637
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:207
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:141
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:247
bool isMaterialized() const
Returns true if this symbolic value has been materialized.
Definition VPlanValue.h:255
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3639
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3599
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3723
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3680
static void handleUncountableEarlyExits(VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VPBasicBlock *MiddleVPBB, UncountableExitStyle Style)
Update Plan to account for uncountable early exits by introducing appropriate branching logic in the ...
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder)
Convert load/store VPInstructions in Plan into widened or replicate recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &EpilogueAllowed)
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
Try to simplify VPInstruction::ExplicitVectorLength recipes when the AVL is known to be <= VF,...
static void removeBranchOnConst(VPlan &Plan, bool OnlyLatches=false)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static bool mergeBlocksIntoPredecessors(VPlan &Plan)
Remove redundant VPBasicBlocks by merging them into their single predecessor if the latter has a sing...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users of first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void optimizeInductionLiveOutUsers(VPlan &Plan, PredicatedScalarEvolution &PSE, bool FoldTail)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
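An illustrative sequence of the cleanup transforms listed here, assuming a VPlan named Plan is in scope; the ordering shown is a sketch, not one prescribed by this file.
// Sketch only: typical cleanup passes over a VPlan.
VPlanTransforms::simplifyRecipes(Plan);
VPlanTransforms::removeDeadRecipes(Plan);
VPlanTransforms::cse(Plan);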
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue, VPValue *Step)
Materialize vector trip count computations to a set of VPInstructions.
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...