1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50 VPlan &Plan,
51 function_ref<const InductionDescriptor *(PHINode *)>
52 GetIntOrFpInductionDescriptor,
53 const TargetLibraryInfo &TLI) {
54
55 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
56 Plan.getVectorLoopRegion());
57 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
58 // Skip blocks outside region
59 if (!VPBB->getParent())
60 break;
61 VPRecipeBase *Term = VPBB->getTerminator();
62 auto EndIter = Term ? Term->getIterator() : VPBB->end();
63 // Introduce each ingredient into VPlan.
64 for (VPRecipeBase &Ingredient :
65 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
66
67 VPValue *VPV = Ingredient.getVPSingleValue();
68 if (!VPV->getUnderlyingValue())
69 continue;
70
71 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
72
73 VPRecipeBase *NewRecipe = nullptr;
74 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
75 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
76 const auto *II = GetIntOrFpInductionDescriptor(Phi);
77 if (!II) {
78 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
79 for (VPValue *Op : PhiR->operands())
80 NewRecipe->addOperand(Op);
81 } else {
82 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
83 VPValue *Step =
84 vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
85 // It is always safe to copy over the NoWrap and FastMath flags. In
86 // particular, when folding tail by masking, the masked-off lanes are
87 // never used, so it is safe.
89 NewRecipe = new VPWidenIntOrFpInductionRecipe(
90 Phi, Start, Step, &Plan.getVF(), *II, Flags,
91 Ingredient.getDebugLoc());
92 }
93 } else {
94 auto *VPI = cast<VPInstruction>(&Ingredient);
95 assert(!isa<PHINode>(Inst) && "phis should be handled above");
96 // Create VPWidenMemoryRecipe for loads and stores.
97 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
98 NewRecipe = new VPWidenLoadRecipe(
99 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
100 false /*Consecutive*/, false /*Reverse*/, *VPI,
101 Ingredient.getDebugLoc());
102 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
103 NewRecipe = new VPWidenStoreRecipe(
104 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
105 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
106 Ingredient.getDebugLoc());
107 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
108 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
109 Ingredient.getDebugLoc());
110 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
111 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
112 if (VectorID == Intrinsic::not_intrinsic)
113 return false;
114 NewRecipe = new VPWidenIntrinsicRecipe(
115 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
116 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
117 *VPI, CI->getDebugLoc());
118 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
119 NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
120 *VPI, Ingredient.getDebugLoc());
121 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
122 NewRecipe = new VPWidenCastRecipe(
123 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
124 VPIRFlags(*CI), VPIRMetadata(*CI));
125 } else {
126 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
127 *VPI, Ingredient.getDebugLoc());
128 }
129 }
130
131 NewRecipe->insertBefore(&Ingredient);
132 if (NewRecipe->getNumDefinedValues() == 1)
133 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
134 else
135 assert(NewRecipe->getNumDefinedValues() == 0 &&
136 "Only recpies with zero or one defined values expected");
137 Ingredient.eraseFromParent();
138 }
139 }
140 return true;
141}
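// Illustrative sketch (not part of the transform): after the conversion
// above, a plain VPInstruction wrapping scalar IR such as
//   %l = load i32, ptr %gep
// is replaced by a widening recipe, conceptually
//   WIDEN ir<%l> = load ir<%gep>
// and a phi with a known induction descriptor becomes a
// VPWidenIntOrFpInductionRecipe instead of a VPWidenPHIRecipe.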
142
143/// Helper for extra no-alias checks via known-safe recipe and SCEV.
144class SinkStoreInfo {
145 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
146 VPReplicateRecipe &GroupLeader;
147 ScalarEvolution &SE;
148 const Loop &L;
149 VPTypeAnalysis &TypeInfo;
150
151 // Return true if \p A and \p B are known to not alias for all VFs in the
152 // plan, checked via the distance between the accesses
153 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
154 if (A->getOpcode() != Instruction::Store ||
155 B->getOpcode() != Instruction::Store)
156 return false;
157
158 VPValue *AddrA = A->getOperand(1);
159 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, SE, &L);
160 VPValue *AddrB = B->getOperand(1);
161 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, SE, &L);
162 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
163 return false;
164
165 const APInt *Distance;
166 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
167 return false;
168
169 const DataLayout &DL = SE.getDataLayout();
170 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
171 uint64_t SizeA = DL.getTypeStoreSize(TyA);
172 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
173 uint64_t SizeB = DL.getTypeStoreSize(TyB);
174
175 // Use the maximum store size to ensure no overlap from either direction.
176 // Currently only handles fixed sizes, as it is only used for
177 // replicating VPReplicateRecipes.
178 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
179
180 auto VFs = B->getParent()->getPlan()->vectorFactors();
182 return Distance->abs().uge(
183 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
184 }
185
186public:
187 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
188 VPReplicateRecipe &GroupLeader, ScalarEvolution &SE,
189 const Loop &L, VPTypeAnalysis &TypeInfo)
190 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), SE(SE), L(L),
191 TypeInfo(TypeInfo) {}
192
193 /// Return true if \p R should be skipped during alias checking, either
194 /// because it's in the exclude set or because no-alias can be proven via
195 /// SCEV.
196 bool shouldSkip(VPRecipeBase &R) const {
197 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
198 return ExcludeRecipes.contains(&R) ||
199 (Store && isNoAliasViaDistance(Store, &GroupLeader));
200 }
201};
202
203/// Check if a memory operation doesn't alias with memory operations in blocks
204/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
205/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
206/// checked (for load hoisting). Otherwise recipes that both read and write
207/// memory are checked, and SCEV is used to prove no-alias between the group
208/// leader and other replicate recipes (for store sinking).
209static bool
211 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
212 std::optional<SinkStoreInfo> SinkInfo = {}) {
213 bool CheckReads = SinkInfo.has_value();
214 if (!MemLoc.AATags.Scope)
215 return false;
216
217 const AAMDNodes &MemAA = MemLoc.AATags;
218
219 for (VPBlockBase *Block = FirstBB; Block;
220 Block = Block->getSingleSuccessor()) {
221 assert(Block->getNumSuccessors() <= 1 &&
222 "Expected at most one successor in block chain");
223 auto *VPBB = cast<VPBasicBlock>(Block);
224 for (VPRecipeBase &R : *VPBB) {
225 if (SinkInfo && SinkInfo->shouldSkip(R))
226 continue;
227
228 // Skip recipes that don't need checking.
229 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
230 continue;
231
233 if (!Loc)
234 // Conservatively assume aliasing for memory operations without
235 // location.
236 return false;
237
238 // For reads, check if they don't alias in the reverse direction and
239 // skip if so.
240 if (CheckReads && R.mayReadFromMemory() &&
242 MemAA.NoAlias))
243 continue;
244
245 // Check if the memory operations may alias in the forward direction.
247 Loc->AATags.NoAlias))
248 return false;
249 }
250
251 if (Block == LastBB)
252 break;
253 }
254 return true;
255}
256
257/// Return true if we do not know how to (mechanically) hoist or sink \p R out
258/// of a loop region.
259static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
260 // Assumes don't alias anything or throw; as long as they're guaranteed to
261 // execute, they're safe to hoist.
262 if (match(&R, m_Intrinsic<Intrinsic::assume>()))
263 return false;
264
265 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
266 // memory location is not modified in the vector loop.
267 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
268 return true;
269
270 // Allocas cannot be hoisted.
271 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
272 return RepR && RepR->getOpcode() == Instruction::Alloca;
273}
274
275static bool sinkScalarOperands(VPlan &Plan) {
276 auto Iter = vp_depth_first_deep(Plan.getEntry());
277 bool ScalarVFOnly = Plan.hasScalarVFOnly();
278 bool Changed = false;
279
280 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
281 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
282 VPBasicBlock *SinkTo, VPValue *Op) {
283 auto *Candidate =
284 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
285 if (!Candidate)
286 return;
287
288 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
289 // for now.
290 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
291 return;
292
293 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
294 return;
295
296 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
297 if (!ScalarVFOnly && RepR->isSingleScalar())
298 return;
299
300 WorkList.insert({SinkTo, Candidate});
301 };
302
303 // First, collect the operands of all recipes in replicate blocks as seeds for
304 // sinking.
305 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
306 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
307 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
308 continue;
309 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
310 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
311 continue;
312 for (auto &Recipe : *VPBB)
313 for (VPValue *Op : Recipe.operands())
314 InsertIfValidSinkCandidate(VPBB, Op);
315 }
316
317 // Try to sink each replicate or scalar IV steps recipe in the worklist.
318 for (unsigned I = 0; I != WorkList.size(); ++I) {
319 VPBasicBlock *SinkTo;
320 VPSingleDefRecipe *SinkCandidate;
321 std::tie(SinkTo, SinkCandidate) = WorkList[I];
322
323 // All recipe users of SinkCandidate must be in the same block SinkTo or all
324 // users outside of SinkTo must only use the first lane of SinkCandidate. In
325 // the latter case, we need to duplicate SinkCandidate.
326 auto UsersOutsideSinkTo =
327 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
328 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
329 });
330 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
331 return !U->usesFirstLaneOnly(SinkCandidate);
332 }))
333 continue;
334 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
335
336 if (NeedsDuplicating) {
337 if (ScalarVFOnly)
338 continue;
339 VPSingleDefRecipe *Clone;
340 if (auto *SinkCandidateRepR =
341 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
342 // TODO: Handle converting to uniform recipes as separate transform,
343 // then cloning should be sufficient here.
344 Instruction *I = SinkCandidate->getUnderlyingInstr();
345 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
346 nullptr /*Mask*/, *SinkCandidateRepR,
347 *SinkCandidateRepR);
348 // TODO: add ".cloned" suffix to name of Clone's VPValue.
349 } else {
350 Clone = SinkCandidate->clone();
351 }
352
353 Clone->insertBefore(SinkCandidate);
354 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
355 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
356 });
357 }
358 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
359 for (VPValue *Op : SinkCandidate->operands())
360 InsertIfValidSinkCandidate(SinkTo, Op);
361 Changed = true;
362 }
363 return Changed;
364}
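// Sketch of sinkScalarOperands' effect (simplified): a scalar recipe, e.g. an
// address computation, defined outside a replicate region but only used by a
// predicated recipe in the region's "then" block is moved into that block, so
// it only executes for active lanes. If users outside the target block need
// just the first lane, the candidate is duplicated rather than moved.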
365
366/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
367/// the mask.
368static VPValue *getPredicatedMask(VPRegionBlock *R) {
369 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
370 if (!EntryBB || EntryBB->size() != 1 ||
371 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
372 return nullptr;
373
374 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
375}
376
377/// If \p R is a triangle region, return the 'then' block of the triangle.
378static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
379 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
380 if (EntryBB->getNumSuccessors() != 2)
381 return nullptr;
382
383 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
384 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
385 if (!Succ0 || !Succ1)
386 return nullptr;
387
388 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
389 return nullptr;
390 if (Succ0->getSingleSuccessor() == Succ1)
391 return Succ0;
392 if (Succ1->getSingleSuccessor() == Succ0)
393 return Succ1;
394 return nullptr;
395}
396
397// Merge replicate regions in their successor region, if a replicate region
398// is connected to a successor replicate region with the same predicate by a
399// single, empty VPBasicBlock.
400static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
401 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
402
403 // Collect replicate regions followed by an empty block, followed by another
404 // replicate region with matching masks to process front. This is to avoid
405 // iterator invalidation issues while merging regions.
406 SmallVector<VPRegionBlock *, 8> WorkList;
407 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
408 vp_depth_first_deep(Plan.getEntry()))) {
409 if (!Region1->isReplicator())
410 continue;
411 auto *MiddleBasicBlock =
412 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
413 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
414 continue;
415
416 auto *Region2 =
417 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
418 if (!Region2 || !Region2->isReplicator())
419 continue;
420
421 VPValue *Mask1 = getPredicatedMask(Region1);
422 VPValue *Mask2 = getPredicatedMask(Region2);
423 if (!Mask1 || Mask1 != Mask2)
424 continue;
425
426 assert(Mask1 && Mask2 && "both regions must have conditions");
427 WorkList.push_back(Region1);
428 }
429
430 // Move recipes from Region1 to its successor region, if both are triangles.
431 for (VPRegionBlock *Region1 : WorkList) {
432 if (TransformedRegions.contains(Region1))
433 continue;
434 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
435 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
436
437 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
438 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
439 if (!Then1 || !Then2)
440 continue;
441
442 // Note: No fusion-preventing memory dependencies are expected in either
443 // region. Such dependencies should be rejected during earlier dependence
444 // checks, which guarantee accesses can be re-ordered for vectorization.
445 //
446 // Move recipes to the successor region.
447 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
448 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
449
450 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
451 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
452
453 // Move VPPredInstPHIRecipes from the merge block to the successor region's
454 // merge block. Update all users inside the successor region to use the
455 // original values.
456 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
457 VPValue *PredInst1 =
458 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
459 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
460 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
461 return cast<VPRecipeBase>(&U)->getParent() == Then2;
462 });
463
464 // Remove phi recipes that are unused after merging the regions.
465 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
466 Phi1ToMove.eraseFromParent();
467 continue;
468 }
469 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
470 }
471
472 // Remove the dead recipes in Region1's entry block.
473 for (VPRecipeBase &R :
474 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
475 R.eraseFromParent();
476
477 // Finally, remove the first region.
478 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
479 VPBlockUtils::disconnectBlocks(Pred, Region1);
480 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
481 }
482 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
483 TransformedRegions.insert(Region1);
484 }
485
486 return !TransformedRegions.empty();
487}
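// Rough shape of the merge above: two replicate regions guarded by the same
// mask and separated only by an empty VPBasicBlock,
//   region1(mask) -> empty.bb -> region2(mask)
// are fused by moving region1's "then" recipes into region2's "then" block and
// re-routing region1's predecessors directly to the empty block.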
488
489static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
490 VPlan &Plan) {
491 Instruction *Instr = PredRecipe->getUnderlyingInstr();
492 // Build the triangular if-then region.
493 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
494 assert(Instr->getParent() && "Predicated instruction not in any basic block");
495 auto *BlockInMask = PredRecipe->getMask();
496 auto *MaskDef = BlockInMask->getDefiningRecipe();
497 auto *BOMRecipe = new VPBranchOnMaskRecipe(
498 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
499 auto *Entry =
500 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
501
502 // Replace predicated replicate recipe with a replicate recipe without a
503 // mask but in the replicate region.
504 auto *RecipeWithoutMask = new VPReplicateRecipe(
505 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
506 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
507 PredRecipe->getDebugLoc());
508 auto *Pred =
509 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
510
511 VPPredInstPHIRecipe *PHIRecipe = nullptr;
512 if (PredRecipe->getNumUsers() != 0) {
513 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
514 RecipeWithoutMask->getDebugLoc());
515 PredRecipe->replaceAllUsesWith(PHIRecipe);
516 PHIRecipe->setOperand(0, RecipeWithoutMask);
517 }
518 PredRecipe->eraseFromParent();
519 auto *Exiting =
520 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
521 VPRegionBlock *Region =
522 Plan.createReplicateRegion(Entry, Exiting, RegionName);
523
524 // Note: first set Entry as region entry and then connect successors starting
525 // from it in order, to propagate the "parent" of each VPBasicBlock.
526 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
527 VPBlockUtils::connectBlocks(Pred, Exiting);
528
529 return Region;
530}
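// The region created above is a triangle; for a masked store it roughly looks
// like (block names follow the "pred.<opcode>" scheme used when the blocks are
// created):
//   pred.store.entry:    BRANCH-ON-MASK <mask>
//   pred.store.if:       unmasked REPLICATE of the store
//   pred.store.continue: optional VPPredInstPHIRecipe merging the result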
531
532static void addReplicateRegions(VPlan &Plan) {
533 SmallVector<VPReplicateRecipe *> WorkList;
534 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
535 vp_depth_first_deep(Plan.getEntry()))) {
536 for (VPRecipeBase &R : *VPBB)
537 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
538 if (RepR->isPredicated())
539 WorkList.push_back(RepR);
540 }
541 }
542
543 unsigned BBNum = 0;
544 for (VPReplicateRecipe *RepR : WorkList) {
545 VPBasicBlock *CurrentBlock = RepR->getParent();
546 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
547
548 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
549 SplitBlock->setName(
550 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
551 // Record predicated instructions for above packing optimizations.
552 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
553 Region->setParent(CurrentBlock->getParent());
554 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
555
556 VPRegionBlock *ParentRegion = Region->getParent();
557 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
558 ParentRegion->setExiting(SplitBlock);
559 }
560}
561
562/// Remove redundant VPBasicBlocks by merging them into their predecessor if
563/// the predecessor has a single successor.
564static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
565 SmallVector<VPBasicBlock *> WorkList;
566 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
567 vp_depth_first_deep(Plan.getEntry()))) {
568 // Don't fold the blocks in the skeleton of the Plan into their single
569 // predecessors for now.
570 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
571 if (!VPBB->getParent())
572 continue;
573 auto *PredVPBB =
574 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
575 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
576 isa<VPIRBasicBlock>(PredVPBB))
577 continue;
578 WorkList.push_back(VPBB);
579 }
580
581 for (VPBasicBlock *VPBB : WorkList) {
582 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
583 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
584 R.moveBefore(*PredVPBB, PredVPBB->end());
585 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
586 auto *ParentRegion = VPBB->getParent();
587 if (ParentRegion && ParentRegion->getExiting() == VPBB)
588 ParentRegion->setExiting(PredVPBB);
589 for (auto *Succ : to_vector(VPBB->successors())) {
590 VPBlockUtils::disconnectBlocks(VPBB, Succ);
591 VPBlockUtils::connectBlocks(PredVPBB, Succ);
592 }
593 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
594 }
595 return !WorkList.empty();
596}
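// E.g. a block B whose single predecessor A has B as its only successor is
// folded away: B's recipes are appended to A and B's successors are rewired to
// A, updating the region's exiting block if needed.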
597
598void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
599 // Convert masked VPReplicateRecipes to if-then region blocks.
600 addReplicateRegions(Plan);
601
602 bool ShouldSimplify = true;
603 while (ShouldSimplify) {
604 ShouldSimplify = sinkScalarOperands(Plan);
605 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
606 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
607 }
608}
609
610/// Remove redundant casts of inductions.
611///
612/// Such redundant casts are casts of induction variables that can be ignored,
613/// because we already proved that the casted phi is equal to the uncasted phi
614/// in the vectorized loop. There is no need to vectorize the cast - the same
615/// value can be used for both the phi and casts in the vector loop.
616static void removeRedundantInductionCasts(VPlan &Plan) {
617 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
618 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
619 if (!IV || IV->getTruncInst())
620 continue;
621
622 // A sequence of IR Casts has potentially been recorded for IV, which
623 // *must be bypassed* when the IV is vectorized, because the vectorized IV
624 // will produce the desired casted value. This sequence forms a def-use
625 // chain and is provided in reverse order, ending with the cast that uses
626 // the IV phi. Search for the recipe of the last cast in the chain and
627 // replace it with the original IV. Note that only the final cast is
628 // expected to have users outside the cast-chain and the dead casts left
629 // over will be cleaned up later.
630 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
631 VPValue *FindMyCast = IV;
632 for (Instruction *IRCast : reverse(Casts)) {
633 VPSingleDefRecipe *FoundUserCast = nullptr;
634 for (auto *U : FindMyCast->users()) {
635 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
636 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
637 FoundUserCast = UserCast;
638 break;
639 }
640 }
641 FindMyCast = FoundUserCast;
642 }
643 FindMyCast->replaceAllUsesWith(IV);
644 }
645}
646
647/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
648/// recipe, if it exists.
649static void removeRedundantCanonicalIVs(VPlan &Plan) {
650 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
651 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
652 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
653 for (VPUser *U : CanonicalIV->users()) {
654 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
655 if (WidenNewIV)
656 break;
657 }
658
659 if (!WidenNewIV)
660 return;
661
662 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
663 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
664 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
665
666 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
667 continue;
668
669 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
670 // everything WidenNewIV's users need. That is, WidenOriginalIV will
671 // generate a vector phi or all users of WidenNewIV demand the first lane
672 // only.
673 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
674 vputils::onlyFirstLaneUsed(WidenNewIV)) {
675 // We are replacing a wide canonical iv with a suitable wide induction.
676 // This is used to compute header mask, hence all lanes will be used and
677 // we need to drop wrap flags only applying to lanes guaranteed to execute
678 // in the original scalar loop.
679 WidenOriginalIV->dropPoisonGeneratingFlags();
680 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
681 WidenNewIV->eraseFromParent();
682 return;
683 }
684 }
685}
686
687/// Returns true if \p R is dead and can be removed.
688static bool isDeadRecipe(VPRecipeBase &R) {
689 // Do remove conditional assume instructions as their conditions may be
690 // flattened.
691 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
692 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
693 match(RepR, m_Intrinsic<Intrinsic::assume>());
694 if (IsConditionalAssume)
695 return true;
696
697 if (R.mayHaveSideEffects())
698 return false;
699
700 // Recipe is dead if no user keeps the recipe alive.
701 return all_of(R.definedValues(),
702 [](VPValue *V) { return V->getNumUsers() == 0; });
703}
704
705void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
706 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
707 vp_post_order_deep(Plan.getEntry()))) {
708 // The recipes in the block are processed in reverse order, to catch chains
709 // of dead recipes.
710 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
711 if (isDeadRecipe(R)) {
712 R.eraseFromParent();
713 continue;
714 }
715
716 // Check if R is a dead VPPhi <-> update cycle and remove it.
717 auto *PhiR = dyn_cast<VPPhi>(&R);
718 if (!PhiR || PhiR->getNumOperands() != 2)
719 continue;
720 VPUser *PhiUser = PhiR->getSingleUser();
721 if (!PhiUser)
722 continue;
723 VPValue *Incoming = PhiR->getOperand(1);
724 if (PhiUser != Incoming->getDefiningRecipe() ||
725 Incoming->getNumUsers() != 1)
726 continue;
727 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
728 PhiR->eraseFromParent();
729 Incoming->getDefiningRecipe()->eraseFromParent();
730 }
731 }
732}
733
734static VPScalarIVStepsRecipe *
735createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
736 Instruction::BinaryOps InductionOpcode,
737 FPMathOperator *FPBinOp, Instruction *TruncI,
738 VPValue *StartV, VPValue *Step, DebugLoc DL,
739 VPBuilder &Builder) {
740 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
741 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
742 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
743 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
744 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
745
746 // Truncate base induction if needed.
747 VPTypeAnalysis TypeInfo(Plan);
748 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
749 if (TruncI) {
750 Type *TruncTy = TruncI->getType();
751 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
752 "Not truncating.");
753 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
754 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
755 ResultTy = TruncTy;
756 }
757
758 // Truncate step if needed.
759 Type *StepTy = TypeInfo.inferScalarType(Step);
760 if (ResultTy != StepTy) {
761 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
762 "Not truncating.");
763 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
764 auto *VecPreheader =
766 VPBuilder::InsertPointGuard Guard(Builder);
767 Builder.setInsertPoint(VecPreheader);
768 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
769 }
770 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
771 &Plan.getVF(), DL);
772}
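// Sketch of the recipes built above for an induction with start S and step T
// (printed names are illustrative):
//   EMIT vp<%offset.idx> = DERIVED-IV S + vp<%canonical-iv> * T
//   EMIT vp<%steps>      = SCALAR-STEPS vp<%offset.idx>, T, vp<%vf>
// with scalar truncates of the base IV and of T inserted when the requested
// result type is narrower.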
773
774static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
775 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
776 for (unsigned I = 0; I != Users.size(); ++I) {
777 VPUser *Cur = Users[I];
778 if (isa<VPHeaderPHIRecipe>(Cur))
779 continue;
780 for (VPValue *V : Cur->definedValues())
781 Users.insert_range(V->users());
782 }
783 return Users.takeVector();
784}
785
786/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
787/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
788/// generates scalar values.
789static VPValue *
790scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
791 VPlan &Plan, VPBuilder &Builder) {
792 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
793 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
794 VPValue *StepV = PtrIV->getOperand(1);
795 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
796 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
797 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
798
799 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
800 PtrIV->getDebugLoc(), "next.gep");
801}
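// I.e. a pointer induction whose vector values are not needed becomes
//   %steps    = scalar-steps(0, step)
//   %next.gep = ptradd(%start, %steps)
// ("next.gep" matches the name passed to createPtrAdd above; other names are
// illustrative).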
802
803/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
804/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
805/// VPWidenPointerInductionRecipe will generate vectors only. If some users
806/// require vectors while others require scalars, the scalar uses need to extract
807/// the scalars from the generated vectors (Note that this is different to how
808/// int/fp inductions are handled). Legalize extract-from-ends using uniform
809/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
810/// the correct end value is available. Also optimize
811/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
812/// providing them scalar steps built on the canonical scalar IV and update the
813/// original IV's users. This is an optional optimization to reduce the need for
814/// vector extracts.
815static void legalizeAndOptimizeInductions(VPlan &Plan) {
816 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
817 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
818 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
819 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
820 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
821 if (!PhiR)
822 continue;
823
824 // Try to narrow wide and replicating recipes to uniform recipes, based on
825 // VPlan analysis.
826 // TODO: Apply to all recipes in the future, to replace legacy uniformity
827 // analysis.
828 auto Users = collectUsersRecursively(PhiR);
829 for (VPUser *U : reverse(Users)) {
830 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
831 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
832 // Skip recipes that shouldn't be narrowed.
833 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
834 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
835 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
836 continue;
837
838 // Skip recipes that may have other lanes than their first used.
840 continue;
841
842 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
843 Def->operands(), /*IsUniform*/ true,
844 /*Mask*/ nullptr, /*Flags*/ *Def);
845 Clone->insertAfter(Def);
846 Def->replaceAllUsesWith(Clone);
847 }
848
849 // Replace wide pointer inductions which have only their scalars used by
850 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
851 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
852 if (!Plan.hasScalarVFOnly() &&
853 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
854 continue;
855
856 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
857 PtrIV->replaceAllUsesWith(PtrAdd);
858 continue;
859 }
860
861 // Replace widened induction with scalar steps for users that only use
862 // scalars.
863 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
864 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
865 return U->usesScalars(WideIV);
866 }))
867 continue;
868
869 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
870 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
871 Plan, ID.getKind(), ID.getInductionOpcode(),
872 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
873 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
874 WideIV->getDebugLoc(), Builder);
875
876 // Update scalar users of IV to use Step instead.
877 if (!HasOnlyVectorVFs) {
878 assert(!Plan.hasScalableVF() &&
879 "plans containing a scalar VF cannot also include scalable VFs");
880 WideIV->replaceAllUsesWith(Steps);
881 } else {
882 bool HasScalableVF = Plan.hasScalableVF();
883 WideIV->replaceUsesWithIf(Steps,
884 [WideIV, HasScalableVF](VPUser &U, unsigned) {
885 if (HasScalableVF)
886 return U.usesFirstLaneOnly(WideIV);
887 return U.usesScalars(WideIV);
888 });
889 }
890 }
891}
892
893/// Check if \p VPV is an untruncated wide induction, either before or after the
894/// increment. If so return the header IV (before the increment), otherwise
895/// return null.
896static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
897 ScalarEvolution &SE) {
898 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
899 if (WideIV) {
900 // VPV itself is a wide induction, separately compute the end value for exit
901 // users if it is not a truncated IV.
902 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
903 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
904 }
905
906 // Check if VPV is an optimizable induction increment.
907 VPRecipeBase *Def = VPV->getDefiningRecipe();
908 if (!Def || Def->getNumOperands() != 2)
909 return nullptr;
910 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
911 if (!WideIV)
912 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
913 if (!WideIV)
914 return nullptr;
915
916 auto IsWideIVInc = [&]() {
917 auto &ID = WideIV->getInductionDescriptor();
918
919 // Check if VPV increments the induction by the induction step.
920 VPValue *IVStep = WideIV->getStepValue();
921 switch (ID.getInductionOpcode()) {
922 case Instruction::Add:
923 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
924 case Instruction::FAdd:
925 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
926 m_Specific(IVStep)));
927 case Instruction::FSub:
928 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
929 m_Specific(IVStep)));
930 case Instruction::Sub: {
931 // IVStep will be the negated step of the subtraction. Check if Step == -1
932 // * IVStep.
933 VPValue *Step;
934 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
935 return false;
936 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
937 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
938 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
939 !isa<SCEVCouldNotCompute>(StepSCEV) &&
940 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
941 }
942 default:
943 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
944 match(VPV, m_GetElementPtr(m_Specific(WideIV),
945 m_Specific(WideIV->getStepValue())));
946 }
947 llvm_unreachable("should have been covered by switch above");
948 };
949 return IsWideIVInc() ? WideIV : nullptr;
950}
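// For example, for a widened induction %iv with step %s, both %iv itself and
// its increment %iv.next = add %iv, %s map back to the header recipe for %iv,
// while a truncated int/fp induction is rejected (nullptr is returned).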
951
952/// Attempts to optimize the induction variable exit values for users in the
953/// early exit block.
954static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
955 VPTypeAnalysis &TypeInfo,
956 VPBlockBase *PredVPBB,
957 VPValue *Op,
958 ScalarEvolution &SE) {
959 VPValue *Incoming, *Mask;
962 return nullptr;
963
964 auto *WideIV = getOptimizableIVOf(Incoming, SE);
965 if (!WideIV)
966 return nullptr;
967
968 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
969 if (WideIntOrFp && WideIntOrFp->getTruncInst())
970 return nullptr;
971
972 // Calculate the final index.
973 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
974 auto *CanonicalIV = LoopRegion->getCanonicalIV();
975 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
976 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
977
978 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
979 VPValue *FirstActiveLane =
980 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
981 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
982 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
983 FirstActiveLaneType, DL);
984 VPValue *EndValue =
985 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
986
987 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
988 // changed it means the exit is using the incremented value, so we need to
989 // add the step.
990 if (Incoming != WideIV) {
991 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
992 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
993 }
994
995 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
996 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
997 VPValue *Start = WideIV->getStartValue();
998 VPValue *Step = WideIV->getStepValue();
999 EndValue = B.createDerivedIV(
1000 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1001 Start, EndValue, Step);
1002 }
1003
1004 return EndValue;
1005}
1006
1007/// Attempts to optimize the induction variable exit values for users in the
1008/// exit block coming from the latch in the original scalar loop.
1009static VPValue *optimizeLatchExitInductionUser(
1010 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1011 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
1012 VPValue *Incoming;
1013 if (!match(Op, m_ExtractLastLane(m_VPValue(Incoming))))
1014 return nullptr;
1015
1016 auto *WideIV = getOptimizableIVOf(Incoming, SE);
1017 if (!WideIV)
1018 return nullptr;
1019
1020 VPValue *EndValue = EndValues.lookup(WideIV);
1021 assert(EndValue && "end value must have been pre-computed");
1022
1023 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1024 // changed it means the exit is using the incremented value, so we don't
1025 // need to subtract the step.
1026 if (Incoming != WideIV)
1027 return EndValue;
1028
1029 // Otherwise, subtract the step from the EndValue.
1030 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1031 VPValue *Step = WideIV->getStepValue();
1032 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1033 if (ScalarTy->isIntegerTy())
1034 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1035 DebugLoc::getUnknown(), "ind.escape");
1036 if (ScalarTy->isPointerTy()) {
1037 Type *StepTy = TypeInfo.inferScalarType(Step);
1038 auto *Zero = Plan.getConstantInt(StepTy, 0);
1039 return B.createPtrAdd(EndValue,
1040 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1041 DebugLoc::getUnknown(), "ind.escape");
1042 }
1043 if (ScalarTy->isFloatingPointTy()) {
1044 const auto &ID = WideIV->getInductionDescriptor();
1045 return B.createNaryOp(
1046 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1047 ? Instruction::FSub
1048 : Instruction::FAdd,
1049 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1050 }
1051 llvm_unreachable("all possible induction types must be handled");
1052 return nullptr;
1053}
1054
1055void VPlanTransforms::optimizeInductionExitUsers(
1056 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1057 ScalarEvolution &SE) {
1058 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1059 VPTypeAnalysis TypeInfo(Plan);
1060 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1061 for (VPRecipeBase &R : ExitVPBB->phis()) {
1062 auto *ExitIRI = cast<VPIRPhi>(&R);
1063
1064 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1065 VPValue *Escape = nullptr;
1066 if (PredVPBB == MiddleVPBB)
1067 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1068 ExitIRI->getOperand(Idx),
1069 EndValues, SE);
1070 else
1071 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
1072 ExitIRI->getOperand(Idx), SE);
1073 if (Escape)
1074 ExitIRI->setOperand(Idx, Escape);
1075 }
1076 }
1077 }
1078}
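// Illustratively, an exit phi that previously received an extract of the
// widened IV now receives an end value computed directly: from the
// pre-computed end values for latch exits, or from the first active lane of
// the exit mask for early exits, avoiding a vector extract.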
1079
1080/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1081/// them with already existing recipes expanding the same SCEV expression.
1082static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1083 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1084
1085 for (VPRecipeBase &R :
1086 make_early_inc_range(*Plan.getEntry())) {
1087 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1088 if (!ExpR)
1089 continue;
1090
1091 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1092 if (Inserted)
1093 continue;
1094 ExpR->replaceAllUsesWith(V->second);
1095 ExpR->eraseFromParent();
1096 }
1097}
1098
1099static void recursivelyDeleteDeadRecipes(VPValue *V) {
1100 SmallVector<VPValue *> WorkList;
1101 SmallPtrSet<VPValue *, 8> Seen;
1102 WorkList.push_back(V);
1103
1104 while (!WorkList.empty()) {
1105 VPValue *Cur = WorkList.pop_back_val();
1106 if (!Seen.insert(Cur).second)
1107 continue;
1108 VPRecipeBase *R = Cur->getDefiningRecipe();
1109 if (!R)
1110 continue;
1111 if (!isDeadRecipe(*R))
1112 continue;
1113 append_range(WorkList, R->operands());
1114 R->eraseFromParent();
1115 }
1116}
1117
1118/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1119/// Returns an optional pair, where the first element indicates whether it is
1120/// an intrinsic ID.
1121static std::optional<std::pair<bool, unsigned>>
1122getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1123 return TypeSwitch<const VPSingleDefRecipe *,
1124 std::optional<std::pair<bool, unsigned>>>(R)
1127 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1128 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1129 return std::make_pair(true, I->getVectorIntrinsicID());
1130 })
1131 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1132 // For recipes that do not directly map to LLVM IR instructions,
1133 // assign opcodes after the last VPInstruction opcode (which is also
1134 // after the last IR Instruction opcode), based on the VPDefID.
1135 return std::make_pair(false,
1136 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1137 })
1138 .Default([](auto *) { return std::nullopt; });
1139}
1140
1141/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1142/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1143/// Operands are foldable live-ins.
1145 ArrayRef<VPValue *> Operands,
1146 const DataLayout &DL,
1147 VPTypeAnalysis &TypeInfo) {
1148 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1149 if (!OpcodeOrIID)
1150 return nullptr;
1151
1152 SmallVector<Value *, 4> Ops;
1153 for (VPValue *Op : Operands) {
1154 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1155 return nullptr;
1156 Ops.push_back(Op->getLiveInIRValue());
1157 }
1158
1159 auto FoldToIRValue = [&]() -> Value * {
1160 InstSimplifyFolder Folder(DL);
1161 if (OpcodeOrIID->first) {
1162 if (R.getNumOperands() != 2)
1163 return nullptr;
1164 unsigned ID = OpcodeOrIID->second;
1165 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1166 TypeInfo.inferScalarType(&R));
1167 }
1168 unsigned Opcode = OpcodeOrIID->second;
1169 if (Instruction::isBinaryOp(Opcode))
1170 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1171 Ops[0], Ops[1]);
1172 if (Instruction::isCast(Opcode))
1173 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1174 TypeInfo.inferScalarType(R.getVPSingleValue()));
1175 switch (Opcode) {
1176 case VPInstruction::LogicalAnd:
1177 return Folder.FoldSelect(Ops[0], Ops[1],
1178 ConstantInt::getNullValue(Ops[1]->getType()));
1179 case VPInstruction::Not:
1180 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1181 Constant::getAllOnesValue(Ops[0]->getType()));
1182 case Instruction::Select:
1183 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1184 case Instruction::ICmp:
1185 case Instruction::FCmp:
1186 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1187 Ops[1]);
1188 case Instruction::GetElementPtr: {
1189 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1190 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1191 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1192 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1193 }
1196 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1197 Ops[0], Ops[1],
1198 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1199 // An extract of a live-in is an extract of a broadcast, so return the
1200 // broadcasted element.
1201 case Instruction::ExtractElement:
1202 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1203 return Ops[0];
1204 }
1205 return nullptr;
1206 };
1207
1208 if (Value *V = FoldToIRValue())
1209 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1210 return nullptr;
1211}
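// For instance, an add of two constant live-ins, or a GEP whose base and
// indices are all live-in constants, folds to a single live-in via
// InstSimplifyFolder; recipes with any non-live-in operand are left untouched.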
1212
1213/// Try to simplify VPSingleDefRecipe \p Def.
1214static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1215 VPlan *Plan = Def->getParent()->getPlan();
1216
1217 // Simplification of live-in IR values for SingleDef recipes using
1218 // InstSimplifyFolder.
1219 const DataLayout &DL =
1221 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1222 return Def->replaceAllUsesWith(V);
1223
1224 // Fold PredPHI LiveIn -> LiveIn.
1225 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1226 VPValue *Op = PredPHI->getOperand(0);
1227 if (Op->isLiveIn())
1228 PredPHI->replaceAllUsesWith(Op);
1229 }
1230
1231 VPBuilder Builder(Def);
1232 VPValue *A;
1233 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1234 Type *TruncTy = TypeInfo.inferScalarType(Def);
1235 Type *ATy = TypeInfo.inferScalarType(A);
1236 if (TruncTy == ATy) {
1237 Def->replaceAllUsesWith(A);
1238 } else {
1239 // Don't replace a scalarizing recipe with a widened cast.
1240 if (isa<VPReplicateRecipe>(Def))
1241 return;
1242 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1243
1244 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1245 ? Instruction::SExt
1246 : Instruction::ZExt;
1247 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1248 TruncTy);
1249 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1250 // UnderlyingExt has distinct return type, used to retain legacy cost.
1251 Ext->setUnderlyingValue(UnderlyingExt);
1252 }
1253 Def->replaceAllUsesWith(Ext);
1254 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1255 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1256 Def->replaceAllUsesWith(Trunc);
1257 }
1258 }
1259#ifndef NDEBUG
1260 // Verify that the cached type info for both A and its users is still
1261 // accurate by comparing it to freshly computed types.
1262 VPTypeAnalysis TypeInfo2(*Plan);
1263 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1264 for (VPUser *U : A->users()) {
1265 auto *R = cast<VPRecipeBase>(U);
1266 for (VPValue *VPV : R->definedValues())
1267 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1268 }
1269#endif
1270 }
1271
1272 // Simplify (X && Y) || (X && !Y) -> X.
1273 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1274 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1275 // recipes to be visited during simplification.
1276 VPValue *X, *Y, *Z;
1277 if (match(Def,
1278 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1279 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1280 Def->replaceAllUsesWith(X);
1281 Def->eraseFromParent();
1282 return;
1283 }
1284
1285 // x | 1 -> 1
1286 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1287 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1288
1289 // x | 0 -> x
1290 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1291 return Def->replaceAllUsesWith(X);
1292
1293 // x & 0 -> 0
1294 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1295 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1296
1297 // x && false -> false
1298 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1299 return Def->replaceAllUsesWith(Def->getOperand(1));
1300
1301 // (x && y) || (x && z) -> x && (y || z)
1302 if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1303 m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
1304 // Simplify only if one of the operands has one use to avoid creating an
1305 // extra recipe.
1306 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1307 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1308 return Def->replaceAllUsesWith(
1309 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1310
1311 // x && !x -> 0
1312 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1313 return Def->replaceAllUsesWith(Plan->getFalse());
1314
1315 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1316 return Def->replaceAllUsesWith(X);
1317
1318 // select !c, x, y -> select c, y, x
1319 VPValue *C;
1320 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1321 Def->setOperand(0, C);
1322 Def->setOperand(1, Y);
1323 Def->setOperand(2, X);
1324 return;
1325 }
1326
1327 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1328 // tail folding it is likely that x is a header mask and can be simplified
1329 // further.
1330 if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1331 m_VPValue(Z))) &&
1332 X->hasMoreThanOneUniqueUser())
1333 return Def->replaceAllUsesWith(
1334 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1335
1336 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1337 return Def->replaceAllUsesWith(A);
1338
1339 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1340 return Def->replaceAllUsesWith(A);
1341
1342 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1343 return Def->replaceAllUsesWith(
1344 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1345
1346 if (match(Def, m_Not(m_VPValue(A)))) {
1347 if (match(A, m_Not(m_VPValue(A))))
1348 return Def->replaceAllUsesWith(A);
1349
1350 // Try to fold Not into compares by adjusting the predicate in-place.
1351 CmpPredicate Pred;
1352 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1353 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1354 if (all_of(Cmp->users(),
1356 m_Not(m_Specific(Cmp)),
1357 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1358 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1359 for (VPUser *U : to_vector(Cmp->users())) {
1360 auto *R = cast<VPSingleDefRecipe>(U);
1361 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1362 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1363 R->setOperand(1, Y);
1364 R->setOperand(2, X);
1365 } else {
1366 // not (cmp pred) -> cmp inv_pred
1367 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1368 R->replaceAllUsesWith(Cmp);
1369 }
1370 }
1371 // If Cmp doesn't have a debug location, use the one from the negation,
1372 // to preserve the location.
1373 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1374 Cmp->setDebugLoc(Def->getDebugLoc());
1375 }
1376 }
1377 }
1378
1379 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1380 // any-of (fcmp uno %A, %B), ...
1381 if (match(Def, m_AnyOf())) {
1382 SmallVector<VPValue *> NewOps;
1383 VPRecipeBase *UnpairedCmp = nullptr;
1384 for (VPValue *Op : Def->operands()) {
1385 VPValue *X;
1386 if (Op->getNumUsers() > 1 ||
1388 m_Deferred(X)))) {
1389 NewOps.push_back(Op);
1390 } else if (!UnpairedCmp) {
1391 UnpairedCmp = Op->getDefiningRecipe();
1392 } else {
1393 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1394 UnpairedCmp->getOperand(0), X));
1395 UnpairedCmp = nullptr;
1396 }
1397 }
1398
1399 if (UnpairedCmp)
1400 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1401
1402 if (NewOps.size() < Def->getNumOperands()) {
1403 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1404 return Def->replaceAllUsesWith(NewAnyOf);
1405 }
1406 }
1407
1408 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1409 // This is useful for fmax/fmin without fast-math flags, where we need to
1410 // check if any operand is NaN.
1412 m_Deferred(X)),
1414 m_Deferred(Y))))) {
1415 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1416 return Def->replaceAllUsesWith(NewCmp);
1417 }
1418
1419 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1420 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1421 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1422 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1423 TypeInfo.inferScalarType(Def))
1424 return Def->replaceAllUsesWith(Def->getOperand(1));
1425
1427 m_One()))) {
1428 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1429 if (TypeInfo.inferScalarType(X) != WideStepTy)
1430 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1431 Def->replaceAllUsesWith(X);
1432 return;
1433 }
1434
1435 // For i1 vp.merges produced by AnyOf reductions:
1436 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1438 m_VPValue(X), m_VPValue())) &&
1440 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1441 Def->setOperand(1, Def->getOperand(0));
1442 Def->setOperand(0, Y);
1443 return;
1444 }
1445
1446 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1447 if (Phi->getOperand(0) == Phi->getOperand(1))
1448 Phi->replaceAllUsesWith(Phi->getOperand(0));
1449 return;
1450 }
1451
1452 // Look through ExtractLastLane.
1453 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1454 if (match(A, m_BuildVector())) {
1455 auto *BuildVector = cast<VPInstruction>(A);
1456 Def->replaceAllUsesWith(
1457 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1458 return;
1459 }
1460 if (Plan->hasScalarVFOnly())
1461 return Def->replaceAllUsesWith(A);
1462 }
1463
1464 // Look through ExtractPenultimateElement (BuildVector ....).
1466 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1467 Def->replaceAllUsesWith(
1468 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1469 return;
1470 }
1471
1472 uint64_t Idx;
1474 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1475 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1476 return;
1477 }
1478
1479 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1480 Def->replaceAllUsesWith(
1481 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1482 return;
1483 }
1484
1485 // Look through broadcast of single-scalar when used as select conditions; in
1486 // that case the scalar condition can be used directly.
1487 if (match(Def,
1490 "broadcast operand must be single-scalar");
1491 Def->setOperand(0, C);
1492 return;
1493 }
1494
1495 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1496 if (Phi->getNumOperands() == 1)
1497 Phi->replaceAllUsesWith(Phi->getOperand(0));
1498 return;
1499 }
1500
1501 // Some simplifications can only be applied after unrolling. Perform them
1502 // below.
1503 if (!Plan->isUnrolled())
1504 return;
1505
1506 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1507 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1508 isa<VPPhi>(X)) {
1509 auto *Phi = cast<VPPhi>(X);
1510 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1511 Phi->getSingleUser() == Def) {
1512 Phi->setOperand(0, Y);
1513 Def->replaceAllUsesWith(Phi);
1514 return;
1515 }
1516 }
1517
1518 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1519 // just the pointer operand.
1520 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1521 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1522 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1523
1524 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1525 // the first lane is demanded.
1526 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1527 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1528 Steps->replaceAllUsesWith(Steps->getOperand(0));
1529 return;
1530 }
1531 }
1532 // Simplify redundant ReductionStartVector recipes after unrolling.
1533 VPValue *StartV;
1534 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1535 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1536 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1537 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1538 return PhiR && PhiR->isInLoop();
1539 });
1540 return;
1541 }
1542
1544 Def->replaceAllUsesWith(A);
1545 return;
1546 }
1547
1548 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1551 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1552 all_of(A->users(),
1553 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1554 return Def->replaceAllUsesWith(A);
1555 }
1556
1557 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1558 return Def->replaceAllUsesWith(A);
1559}
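// A concrete instance of the rewrites above: with a negated condition,
//   EMIT vp<%n> = not vp<%c>
//   EMIT vp<%r> = select vp<%n>, vp<%x>, vp<%y>
// is turned into select vp<%c>, vp<%y>, vp<%x>, after which the unused not is
// removed as a dead recipe.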
1560
1561void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1562 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1563 Plan.getEntry());
1564 VPTypeAnalysis TypeInfo(Plan);
1565 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1566 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1567 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1568 simplifyRecipe(Def, TypeInfo);
1569 }
1570}
1571
1572void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1573 if (Plan.hasScalarVFOnly())
1574 return;
1575
1576 // Try to narrow wide and replicating recipes to single scalar recipes,
1577 // based on VPlan analysis. Only process blocks in the loop region for now,
1578 // without traversing into nested regions, as recipes in replicate regions
1579 // cannot be converted yet.
1580 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1581 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1582 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1585 continue;
1586 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1587 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1588 continue;
1589
1590 // Convert an unmasked scatter with a uniform address into
1591 // extract-last-lane + scalar store.
1592 // TODO: Add a profitability check comparing the cost of a scatter vs.
1593 // extract + scalar store.
1594 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1595 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1596 !WidenStoreR->isConsecutive()) {
1597 assert(!WidenStoreR->isReverse() &&
1598 "Not consecutive memory recipes shouldn't be reversed");
1599 VPValue *Mask = WidenStoreR->getMask();
1600
1601 // Only convert the scatter to a scalar store if it is unmasked.
1602 // TODO: Support converting scatter masked by the header mask to scalar
1603 // store.
1604 if (Mask)
1605 continue;
1606
1607 auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane,
1608 {WidenStoreR->getOperand(1)});
1609 Extract->insertBefore(WidenStoreR);
1610
1611 // TODO: Sink the scalar store recipe to middle block if possible.
1612 auto *ScalarStore = new VPReplicateRecipe(
1613 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1614 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1615 *WidenStoreR /*Metadata*/);
1616 ScalarStore->insertBefore(WidenStoreR);
1617 WidenStoreR->eraseFromParent();
1618 continue;
1619 }
1620
1621 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1622 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1623 vputils::isSingleScalar(RepR->getOperand(1))) {
1624 auto *Clone = new VPReplicateRecipe(
1625 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1626 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1627 *RepR /*Metadata*/, RepR->getDebugLoc());
1628 Clone->insertBefore(RepOrWidenR);
1629 VPBuilder Builder(Clone);
1630 VPValue *ExtractOp = Clone->getOperand(0);
1631 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1632 ExtractOp =
1633 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1634 ExtractOp =
1635 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1636 Clone->setOperand(0, ExtractOp);
1637 RepR->eraseFromParent();
1638 continue;
1639 }
1640
1641 // Skip recipes that aren't single scalars.
1642 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1643 continue;
1644
1645 // Skip recipes for which conversion to single-scalar would introduce
1646 // additional broadcasts. No extra broadcasts are needed if either only
1647 // the scalars of the recipe are used, or at least one of the operands
1648 // would require a broadcast. In the latter case, the single-scalar result
1649 // may need to be broadcast, but another broadcast is removed.
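// Illustrative example (names are made up): for
//   WIDEN ir<%a> = add ir<%inv>, ir<%x>
// where %inv is a non-constant live-in only used here, keeping the add
// widened requires broadcasting %inv; narrowing it to a single scalar at
// most moves that broadcast to %a, so no extra broadcasts are introduced.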
1650 if (!all_of(RepOrWidenR->users(),
1651 [RepOrWidenR](const VPUser *U) {
1652 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1653 unsigned Opcode = VPI->getOpcode();
1654 if (Opcode == VPInstruction::ExtractLastLane ||
1655 Opcode == VPInstruction::ExtractLastPart ||
1656 Opcode == VPInstruction::ExtractPenultimateElement)
1657 return true;
1658 }
1659
1660 return U->usesScalars(RepOrWidenR);
1661 }) &&
1662 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1663 if (Op->getSingleUser() != RepOrWidenR)
1664 return false;
1665 // Non-constant live-ins require broadcasts, while constants do not
1666 // need explicit broadcasts.
1667 bool LiveInNeedsBroadcast =
1668 Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
1669 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1670 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1671 }))
1672 continue;
1673
1674 auto *Clone = new VPReplicateRecipe(
1675 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1676 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1677 Clone->insertBefore(RepOrWidenR);
1678 RepOrWidenR->replaceAllUsesWith(Clone);
1679 if (isDeadRecipe(*RepOrWidenR))
1680 RepOrWidenR->eraseFromParent();
1681 }
1682 }
1683}
1684
1685/// Try to see if all of \p Blend's masks share a common value that is
1686/// logically and'ed in, and if so remove it from the masks.
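/// Illustrative example (made-up names): if every blend mask has the form
///   EMIT vp<%mI> = logical-and vp<%common>, vp<%edgeI>
/// then vp<%common> is stripped and each mask I is replaced by vp<%edgeI>.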
1688 if (Blend->isNormalized())
1689 return;
1690 VPValue *CommonEdgeMask;
1691 if (!match(Blend->getMask(0),
1692 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1693 return;
1694 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1695 if (!match(Blend->getMask(I),
1696 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1697 return;
1698 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1699 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1700}
1701
1702/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1703/// to make sure the masks are simplified.
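/// Illustrative example (made-up names, approximate printing): a
/// non-normalized blend
///   BLEND ir<%r> = ir<%a>/vp<%m0> ir<%b>/vp<%m1>
/// whose mask vp<%m0> is only used by the blend is normalized so that ir<%a>
/// becomes the unmasked pass-through value:
///   BLEND ir<%r> = ir<%a> ir<%b>/vp<%m1>
/// leaving vp<%m0> dead and removable.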
1704static void simplifyBlends(VPlan &Plan) {
1707 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1708 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1709 if (!Blend)
1710 continue;
1711
1712 removeCommonBlendMask(Blend);
1713
1714 // Try to remove redundant blend recipes.
1715 SmallPtrSet<VPValue *, 4> UniqueValues;
1716 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1717 UniqueValues.insert(Blend->getIncomingValue(0));
1718 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1719 if (!match(Blend->getMask(I), m_False()))
1720 UniqueValues.insert(Blend->getIncomingValue(I));
1721
1722 if (UniqueValues.size() == 1) {
1723 Blend->replaceAllUsesWith(*UniqueValues.begin());
1724 Blend->eraseFromParent();
1725 continue;
1726 }
1727
1728 if (Blend->isNormalized())
1729 continue;
1730
1731 // Normalize the blend so its first incoming value is used as the initial
1732 // value with the others blended into it.
1733
1734 unsigned StartIndex = 0;
1735 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1736 // If a value's mask is used only by the blend then it can be dead-coded.
1737 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1738 // that's used by multiple blends where it can be removed from them all.
1739 VPValue *Mask = Blend->getMask(I);
1740 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1741 StartIndex = I;
1742 break;
1743 }
1744 }
1745
1746 SmallVector<VPValue *, 4> OperandsWithMask;
1747 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1748
1749 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1750 if (I == StartIndex)
1751 continue;
1752 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1753 OperandsWithMask.push_back(Blend->getMask(I));
1754 }
1755
1756 auto *NewBlend =
1757 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1758 OperandsWithMask, Blend->getDebugLoc());
1759 NewBlend->insertBefore(&R);
1760
1761 VPValue *DeadMask = Blend->getMask(StartIndex);
1762 Blend->replaceAllUsesWith(NewBlend);
1763 Blend->eraseFromParent();
1765
1766 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1767 VPValue *NewMask;
1768 if (NewBlend->getNumOperands() == 3 &&
1769 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1770 VPValue *Inc0 = NewBlend->getOperand(0);
1771 VPValue *Inc1 = NewBlend->getOperand(1);
1772 VPValue *OldMask = NewBlend->getOperand(2);
1773 NewBlend->setOperand(0, Inc1);
1774 NewBlend->setOperand(1, Inc0);
1775 NewBlend->setOperand(2, NewMask);
1776 if (OldMask->getNumUsers() == 0)
1777 cast<VPInstruction>(OldMask)->eraseFromParent();
1778 }
1779 }
1780 }
1781}
1782
1783/// Optimize the width of vector induction variables in \p Plan based on a known
1784/// constant Trip Count, \p BestVF and \p BestUF.
1786 ElementCount BestVF,
1787 unsigned BestUF) {
1788 // Only proceed if we have not completely removed the vector region.
1789 if (!Plan.getVectorLoopRegion())
1790 return false;
1791
1792 const APInt *TC;
1793 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1794 return false;
1795
1796 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1797 // and UF. Returns at least 8.
1798 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1799 APInt AlignedTC =
1802 APInt MaxVal = AlignedTC - 1;
1803 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1804 };
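// Worked example (illustrative, assuming AlignedTC rounds TC up to a
// multiple of VF * UF): with a known TC of 100, VF = 4 and UF = 2, the trip
// count is aligned up to 104, so the largest IV value is 103, which needs 7
// active bits; rounding up to a power of two (and to at least 8) yields an
// 8-bit induction type.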
1805 unsigned NewBitWidth =
1806 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1807
1808 LLVMContext &Ctx = Plan.getContext();
1809 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1810
1811 bool MadeChange = false;
1812
1813 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1814 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1815 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1816
1817 // Currently we only handle canonical IVs, as it is trivial to replace their
1818 // start and stop values, and we only perform the optimization when the
1819 // IV has a single user.
1820 if (!WideIV || !WideIV->isCanonical() ||
1821 WideIV->hasMoreThanOneUniqueUser() ||
1822 NewIVTy == WideIV->getScalarType())
1823 continue;
1824
1825 // Currently only handle cases where the single user is a header-mask
1826 // comparison with the backedge-taken-count.
1827 VPUser *SingleUser = WideIV->getSingleUser();
1828 if (!SingleUser ||
1829 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1832 continue;
1833
1834 // Update IV operands and comparison bound to use new narrower type.
1835 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1836 WideIV->setStartValue(NewStart);
1837 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1838 WideIV->setStepValue(NewStep);
1839
1840 auto *NewBTC = new VPWidenCastRecipe(
1841 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1842 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1843 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1844 Cmp->setOperand(1, NewBTC);
1845
1846 MadeChange = true;
1847 }
1848
1849 return MadeChange;
1850}
1851
1852/// Return true if \p Cond is known to be true for given \p BestVF and \p
1853/// BestUF.
1855 ElementCount BestVF, unsigned BestUF,
1856 ScalarEvolution &SE) {
1858 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1859 &SE](VPValue *C) {
1860 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1861 });
1862
1863 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1865 m_Specific(CanIV->getBackedgeValue()),
1866 m_Specific(&Plan.getVectorTripCount()))))
1867 return false;
1868
1869 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1870 // count is not conveniently available as SCEV so far, so we compare directly
1871 // against the original trip count. This is stricter than necessary, as we
1872 // will only return true if the trip count == vector trip count.
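// Worked example (illustrative): with a constant trip count of 8, VF = 8 and
// UF = 1, SE can prove trip count == VF * UF, so the exit condition comparing
// the incremented IV against the vector trip count is known to hold after the
// first iteration.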
1873 const SCEV *VectorTripCount =
1875 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1876 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1877 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1878 "Trip count SCEV must be computable");
1879 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1880 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1881 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1882}
1883
1884/// Try to replace multiple active lane masks used for control flow with
1885/// a single, wide active lane mask instruction followed by multiple
1886/// extract subvector intrinsics. This applies to the active lane mask
1887/// instructions both in the loop and in the preheader.
1888/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1889/// new extracts from the first active lane mask, which has its last
1890/// operand (multiplier) set to UF.
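/// Illustrative sketch (assuming fixed VF = 4 and UF = 2; names and printing
/// are approximate): two per-part masks
///   EMIT vp<%alm0> = active-lane-mask vp<%index0>, vp<%tc>
///   EMIT vp<%alm1> = active-lane-mask vp<%index1>, vp<%tc>
/// become a single wide mask of VF * UF = 8 lanes plus per-part extracts:
///   EMIT vp<%wide> = active-lane-mask vp<%index0>, vp<%tc>, multiplier UF
///   WIDEN-INTRINSIC vp<%m0> = call llvm.vector.extract(vp<%wide>, 0)
///   WIDEN-INTRINSIC vp<%m1> = call llvm.vector.extract(vp<%wide>, 4)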
1892 unsigned UF) {
1893 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1894 return false;
1895
1896 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1897 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1898 auto *Term = &ExitingVPBB->back();
1899
1900 using namespace llvm::VPlanPatternMatch;
1902 m_VPValue(), m_VPValue(), m_VPValue())))))
1903 return false;
1904
1905 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1906 LLVMContext &Ctx = Plan.getContext();
1907
1908 auto ExtractFromALM = [&](VPInstruction *ALM,
1909 SmallVectorImpl<VPValue *> &Extracts) {
1910 DebugLoc DL = ALM->getDebugLoc();
1911 for (unsigned Part = 0; Part < UF; ++Part) {
1913 Ops.append({ALM, Plan.getOrAddLiveIn(
1914 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1915 VF.getKnownMinValue() * Part))});
1916 auto *Ext =
1917 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1918 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1919 Extracts[Part] = Ext;
1920 Ext->insertAfter(ALM);
1921 }
1922 };
1923
1924 // Create a list of each active lane mask phi, ordered by unroll part.
1926 for (VPRecipeBase &R : Header->phis()) {
1928 if (!Phi)
1929 continue;
1930 VPValue *Index = nullptr;
1931 match(Phi->getBackedgeValue(),
1933 assert(Index && "Expected index from ActiveLaneMask instruction");
1934
1935 uint64_t Part;
1936 if (match(Index,
1938 m_VPValue(), m_ConstantInt(Part))))
1939 Phis[Part] = Phi;
1940 else
1941 // Anything other than a CanonicalIVIncrementForPart is part 0
1942 Phis[0] = Phi;
1943 }
1944
1945 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1946 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1947
1948 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1949 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1950
1951 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1952 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1953 "Expected incoming values of Phi to be ActiveLaneMasks");
1954
1955 // When using wide lane masks, the get.active.lane.mask intrinsic returns a
1956 // mask of VF x UF lanes, with the multiplier as its last operand.
1957 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1958 EntryALM->setOperand(2, ALMMultiplier);
1959 LoopALM->setOperand(2, ALMMultiplier);
1960
1961 // Create UF x extract vectors and insert into preheader.
1962 SmallVector<VPValue *> EntryExtracts(UF);
1963 ExtractFromALM(EntryALM, EntryExtracts);
1964
1965 // Create UF x extract vectors and insert before the loop compare & branch,
1966 // updating the compare to use the first extract.
1967 SmallVector<VPValue *> LoopExtracts(UF);
1968 ExtractFromALM(LoopALM, LoopExtracts);
1969 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1970 Not->setOperand(0, LoopExtracts[0]);
1971
1972 // Update the incoming values of active lane mask phis.
1973 for (unsigned Part = 0; Part < UF; ++Part) {
1974 Phis[Part]->setStartValue(EntryExtracts[Part]);
1975 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1976 }
1977
1978 return true;
1979}
1980
1981/// Try to simplify the branch condition of \p Plan. This may restrict the
1982/// resulting plan to \p BestVF and \p BestUF.
1984 unsigned BestUF,
1986 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1987 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1988 auto *Term = &ExitingVPBB->back();
1989 VPValue *Cond;
1990 ScalarEvolution &SE = *PSE.getSE();
1991 if (match(Term, m_BranchOnCount()) ||
1993 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1994 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1995 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1996 const SCEV *VectorTripCount =
1998 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1999 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
2000 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2001 "Trip count SCEV must be computable");
2002 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2003 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2004 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2005 return false;
2006 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
2007 // For BranchOnCond, check if we can prove the condition to be true using VF
2008 // and UF.
2009 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
2010 return false;
2011 } else {
2012 return false;
2013 }
2014
2015 // The vector loop region only executes once. If possible, completely remove
2016 // the region, otherwise replace the terminator controlling the latch with
2017 // (BranchOnCond true).
2018 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2019 // support for other non-canonical widen induction recipes (e.g.,
2020 // VPWidenPointerInductionRecipe).
2021 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2022 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2023 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2024 return R->isCanonical();
2025 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2026 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2027 })) {
2028 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2029 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2030 VPBuilder Builder(Plan.getVectorPreheader());
2031 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2032 R->getScalarType());
2033 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2034 HeaderR.eraseFromParent();
2035 continue;
2036 }
2037 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2038 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2039 HeaderR.eraseFromParent();
2040 }
2041
2042 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2043 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
2044 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2045 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2046
2047 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2048 B->setParent(nullptr);
2049
2050 VPBlockUtils::connectBlocks(Preheader, Header);
2051 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2053 } else {
2054 // The vector region contains header phis for which we cannot remove the
2055 // loop region yet.
2056 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2057 {}, {}, Term->getDebugLoc());
2058 ExitingVPBB->appendRecipe(BOC);
2059 }
2060
2061 Term->eraseFromParent();
2062
2063 return true;
2064}
2065
2066/// From the definition of llvm.experimental.get.vector.length,
2067/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
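/// Worked example (illustrative): with VF = 4 and an AVL that SE can prove is
/// at most 4, the recipe
///   EMIT vp<%evl> = EXPLICIT-VECTOR-LENGTH vp<%avl>
/// is replaced by a zext/trunc of vp<%avl> to i32, the type the EVL recipe
/// produces.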
2071 vp_depth_first_deep(Plan.getEntry()))) {
2072 for (VPRecipeBase &R : *VPBB) {
2073 VPValue *AVL;
2074 if (!match(&R, m_EVL(m_VPValue(AVL))))
2075 continue;
2076
2077 ScalarEvolution &SE = *PSE.getSE();
2078 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
2079 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2080 continue;
2081 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2082 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2083 continue;
2084
2086 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2087 R.getDebugLoc());
2088 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2089 return true;
2090 }
2091 }
2092 return false;
2093}
2094
2096 unsigned BestUF,
2098 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2099 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2100
2101 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2102 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2103 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2104 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2105
2106 if (MadeChange) {
2107 Plan.setVF(BestVF);
2108 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2109 }
2110}
2111
2112/// Sink users of \p FOR after the recipe defining the previous value \p
2113/// Previous of the recurrence. \returns true if all users of \p FOR could be
2114/// re-arranged as needed or false if it is not possible.
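/// Illustrative sketch (made-up names, approximate printing): for
///   FIRST-ORDER-RECURRENCE-PHI ir<%for> = phi ir<%init>, ir<%prev>
///   EMIT vp<%use> = add ir<%for>, 1
///   EMIT ir<%prev> = mul ir<%x>, 2
/// the user vp<%use> is sunk after ir<%prev>, so that a
/// first-order-recurrence-splice of ir<%for> and ir<%prev> can later be
/// inserted before all of the recurrence's users.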
2115static bool
2117 VPRecipeBase *Previous,
2118 VPDominatorTree &VPDT) {
2119 // Collect recipes that need sinking.
2122 Seen.insert(Previous);
2123 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2124 // The previous value must not depend on the users of the recurrence phi;
2125 // otherwise, FOR would not be a fixed-order recurrence.
2126 if (SinkCandidate == Previous)
2127 return false;
2128
2129 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2130 !Seen.insert(SinkCandidate).second ||
2131 VPDT.properlyDominates(Previous, SinkCandidate))
2132 return true;
2133
2134 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2135 return false;
2136
2137 WorkList.push_back(SinkCandidate);
2138 return true;
2139 };
2140
2141 // Recursively sink users of FOR after Previous.
2142 WorkList.push_back(FOR);
2143 for (unsigned I = 0; I != WorkList.size(); ++I) {
2144 VPRecipeBase *Current = WorkList[I];
2145 assert(Current->getNumDefinedValues() == 1 &&
2146 "only recipes with a single defined value expected");
2147
2148 for (VPUser *User : Current->getVPSingleValue()->users()) {
2149 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2150 return false;
2151 }
2152 }
2153
2154 // Keep recipes to sink ordered by dominance so earlier instructions are
2155 // processed first.
2156 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2157 return VPDT.properlyDominates(A, B);
2158 });
2159
2160 for (VPRecipeBase *SinkCandidate : WorkList) {
2161 if (SinkCandidate == FOR)
2162 continue;
2163
2164 SinkCandidate->moveAfter(Previous);
2165 Previous = SinkCandidate;
2166 }
2167 return true;
2168}
2169
2170/// Try to hoist \p Previous and its operands before all users of \p FOR.
2172 VPRecipeBase *Previous,
2173 VPDominatorTree &VPDT) {
2174 if (cannotHoistOrSinkRecipe(*Previous))
2175 return false;
2176
2177 // Collect recipes that need hoisting.
2178 SmallVector<VPRecipeBase *> HoistCandidates;
2180 VPRecipeBase *HoistPoint = nullptr;
2181 // Find the closest hoist point by looking at all users of FOR and selecting
2182 // the recipe dominating all other users.
2183 for (VPUser *U : FOR->users()) {
2184 auto *R = cast<VPRecipeBase>(U);
2185 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2186 HoistPoint = R;
2187 }
2188 assert(all_of(FOR->users(),
2189 [&VPDT, HoistPoint](VPUser *U) {
2190 auto *R = cast<VPRecipeBase>(U);
2191 return HoistPoint == R ||
2192 VPDT.properlyDominates(HoistPoint, R);
2193 }) &&
2194 "HoistPoint must dominate all users of FOR");
2195
2196 auto NeedsHoisting = [HoistPoint, &VPDT,
2197 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2198 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2199 if (!HoistCandidate)
2200 return nullptr;
2201 VPRegionBlock *EnclosingLoopRegion =
2202 HoistCandidate->getParent()->getEnclosingLoopRegion();
2203 assert((!HoistCandidate->getRegion() ||
2204 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2205 "CFG in VPlan should still be flat, without replicate regions");
2206 // Hoist candidate was already visited, no need to hoist.
2207 if (!Visited.insert(HoistCandidate).second)
2208 return nullptr;
2209
2210 // Candidate is outside the loop region or is a header phi; it dominates
2211 // FOR's users without hoisting.
2212 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2213 return nullptr;
2214
2215 // If we reached a recipe that dominates HoistPoint, we don't need to
2216 // hoist the recipe.
2217 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2218 return nullptr;
2219 return HoistCandidate;
2220 };
2221
2222 if (!NeedsHoisting(Previous->getVPSingleValue()))
2223 return true;
2224
2225 // Recursively try to hoist Previous and its operands before all users of FOR.
2226 HoistCandidates.push_back(Previous);
2227
2228 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2229 VPRecipeBase *Current = HoistCandidates[I];
2230 assert(Current->getNumDefinedValues() == 1 &&
2231 "only recipes with a single defined value expected");
2232 if (cannotHoistOrSinkRecipe(*Current))
2233 return false;
2234
2235 for (VPValue *Op : Current->operands()) {
2236 // If we reach FOR, it means the original Previous depends on some other
2237 // recurrence that in turn depends on FOR. If that is the case, we would
2238 // also need to hoist recipes involving the other FOR, which may break
2239 // dependencies.
2240 if (Op == FOR)
2241 return false;
2242
2243 if (auto *R = NeedsHoisting(Op)) {
2244 // Bail out if the recipe defines multiple values.
2245 // TODO: Hoisting such recipes requires additional handling.
2246 if (R->getNumDefinedValues() != 1)
2247 return false;
2248 HoistCandidates.push_back(R);
2249 }
2250 }
2251 }
2252
2253 // Order recipes to hoist by dominance so earlier instructions are processed
2254 // first.
2255 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2256 return VPDT.properlyDominates(A, B);
2257 });
2258
2259 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2260 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2261 HoistPoint->getIterator());
2262 }
2263
2264 return true;
2265}
2266
2268 VPBuilder &LoopBuilder) {
2269 VPDominatorTree VPDT(Plan);
2270
2272 for (VPRecipeBase &R :
2275 RecurrencePhis.push_back(FOR);
2276
2277 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2279 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2280 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2281 // to terminate.
2282 while (auto *PrevPhi =
2284 assert(PrevPhi->getParent() == FOR->getParent());
2285 assert(SeenPhis.insert(PrevPhi).second);
2286 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2287 }
2288
2289 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2290 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2291 return false;
2292
2293 // Introduce a recipe to combine the incoming and previous values of a
2294 // fixed-order recurrence.
2295 VPBasicBlock *InsertBlock = Previous->getParent();
2296 if (isa<VPHeaderPHIRecipe>(Previous))
2297 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2298 else
2299 LoopBuilder.setInsertPoint(InsertBlock,
2300 std::next(Previous->getIterator()));
2301
2302 auto *RecurSplice =
2304 {FOR, FOR->getBackedgeValue()});
2305
2306 FOR->replaceAllUsesWith(RecurSplice);
2307 // Set the first operand of RecurSplice to FOR again, after replacing
2308 // all users.
2309 RecurSplice->setOperand(0, FOR);
2310
2311 // Check for users extracting at the penultimate active lane of the FOR.
2312 // If only a single lane is active in the current iteration, we need to
2313 // select the last element from the previous iteration (from the FOR phi
2314 // directly).
2315 for (VPUser *U : RecurSplice->users()) {
2317 m_Specific(RecurSplice))))
2318 continue;
2319
2321 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2322 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2323 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2324 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2325 VPValue *PenultimateIndex =
2326 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2327 VPValue *PenultimateLastIter =
2328 B.createNaryOp(VPInstruction::ExtractLane,
2329 {PenultimateIndex, FOR->getBackedgeValue()});
2330 VPValue *LastPrevIter =
2331 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2332
2333 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2334 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2335 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2336 }
2337 }
2338 return true;
2339}
2340
2342 for (VPRecipeBase &R :
2344 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2345 if (!PhiR)
2346 continue;
2347 RecurKind RK = PhiR->getRecurrenceKind();
2348 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2350 continue;
2351
2352 for (VPUser *U : collectUsersRecursively(PhiR))
2353 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2354 RecWithFlags->dropPoisonGeneratingFlags();
2355 }
2356 }
2357}
2358
2359namespace {
2360struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2361 static bool isSentinel(const VPSingleDefRecipe *Def) {
2362 return Def == getEmptyKey() || Def == getTombstoneKey();
2363 }
2364
2365 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2366 /// return that source element type.
2367 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2368 // All VPInstructions that lower to GEPs must have the i8 source element
2369 // type (as they are PtrAdds), so we omit it.
2371 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2372 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2373 return GEP->getSourceElementType();
2374 return nullptr;
2375 })
2376 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2377 [](auto *I) { return I->getSourceElementType(); })
2378 .Default([](auto *) { return nullptr; });
2379 }
2380
2381 /// Returns true if recipe \p Def can be safely handled for CSE.
2382 static bool canHandle(const VPSingleDefRecipe *Def) {
2383 // We can extend the list of handled recipes in the future,
2384 // provided we account for the data embedded in them while checking for
2385 // equality or hashing.
2386 auto C = getOpcodeOrIntrinsicID(Def);
2387
2388 // The issue with (Insert|Extract)Value is that the index of the
2389 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2390 // VPlan.
2391 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2392 C->second == Instruction::ExtractValue)))
2393 return false;
2394
2395 // During CSE, we can only handle recipes that don't read from memory: if
2396 // they read from memory, there could be an intervening write to memory
2397 // before the next instance is CSE'd, leading to an incorrect result.
2398 return !Def->mayReadFromMemory();
2399 }
2400
2401 /// Hash the underlying data of \p Def.
2402 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2403 const VPlan *Plan = Def->getParent()->getPlan();
2404 VPTypeAnalysis TypeInfo(*Plan);
2405 hash_code Result = hash_combine(
2406 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2407 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2409 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2410 if (RFlags->hasPredicate())
2411 return hash_combine(Result, RFlags->getPredicate());
2412 return Result;
2413 }
2414
2415 /// Check equality of underlying data of \p L and \p R.
2416 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2417 if (isSentinel(L) || isSentinel(R))
2418 return L == R;
2419 if (L->getVPDefID() != R->getVPDefID() ||
2421 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2423 !equal(L->operands(), R->operands()))
2424 return false;
2426 "must have valid opcode info for both recipes");
2427 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2428 if (LFlags->hasPredicate() &&
2429 LFlags->getPredicate() !=
2430 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2431 return false;
2432 // Recipes in replicate regions implicitly depend on predicate. If either
2433 // recipe is in a replicate region, only consider them equal if both have
2434 // the same parent.
2435 const VPRegionBlock *RegionL = L->getRegion();
2436 const VPRegionBlock *RegionR = R->getRegion();
2437 if (((RegionL && RegionL->isReplicator()) ||
2438 (RegionR && RegionR->isReplicator())) &&
2439 L->getParent() != R->getParent())
2440 return false;
2441 const VPlan *Plan = L->getParent()->getPlan();
2442 VPTypeAnalysis TypeInfo(*Plan);
2443 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2444 }
2445};
2446} // end anonymous namespace
2447
2448/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2449/// Plan.
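/// Illustrative example (made-up names, approximate printing): given two
/// identical recipes
///   WIDEN-GEP ir<%g1> = getelementptr ir<%base>, ir<%idx>
///   WIDEN-GEP ir<%g2> = getelementptr ir<%base>, ir<%idx>
/// where the first dominates the second, all uses of ir<%g2> are replaced by
/// ir<%g1>; the now-dead recipe is left for later dead-recipe removal.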
2451 VPDominatorTree VPDT(Plan);
2453
2455 vp_depth_first_deep(Plan.getEntry()))) {
2456 for (VPRecipeBase &R : *VPBB) {
2457 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2458 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2459 continue;
2460 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2461 // V must dominate Def for a valid replacement.
2462 if (!VPDT.dominates(V->getParent(), VPBB))
2463 continue;
2464 // Only keep flags present on both V and Def.
2465 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2466 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2467 Def->replaceAllUsesWith(V);
2468 continue;
2469 }
2470 CSEMap[Def] = Def;
2471 }
2472 }
2473}
2474
2475/// Move loop-invariant recipes out of the vector loop region in \p Plan.
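/// Illustrative example (made-up names): a recipe such as
///   WIDEN ir<%p> = mul ir<%inv1>, ir<%inv2>
/// whose operands are all defined outside the loop region is moved from the
/// vector loop body into the vector preheader.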
2476static void licm(VPlan &Plan) {
2477 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2478
2479 // Hoist any loop invariant recipes from the vector loop region to the
2480 // preheader. Perform a shallow traversal of the vector loop region, to
2481 // exclude recipes in replicate regions. Since the top-level blocks in the
2482 // vector loop region are guaranteed to execute if the vector pre-header is,
2483 // we don't need to check speculation safety.
2484 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2485 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2486 "Expected vector preheader's successor to be the vector loop region");
2488 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2489 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2491 continue;
2492 if (any_of(R.operands(), [](VPValue *Op) {
2493 return !Op->isDefinedOutsideLoopRegions();
2494 }))
2495 continue;
2496 R.moveBefore(*Preheader, Preheader->end());
2497 }
2498 }
2499}
2500
2502 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2503 if (Plan.hasScalarVFOnly())
2504 return;
2505 // Keep track of created truncates, so they can be re-used. Note that we
2506 // cannot use RAUW after creating a new truncate, as this could make
2507 // other uses have different types for their operands, making them invalidly
2508 // typed.
2510 VPTypeAnalysis TypeInfo(Plan);
2511 VPBasicBlock *PH = Plan.getVectorPreheader();
2514 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2517 &R))
2518 continue;
2519
2520 VPValue *ResultVPV = R.getVPSingleValue();
2521 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2522 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2523 if (!NewResSizeInBits)
2524 continue;
2525
2526 // If the value wasn't vectorized, we must maintain the original scalar
2527 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2528 // skip casts which do not need to be handled explicitly here, as
2529 // redundant casts will be removed during recipe simplification.
2531 continue;
2532
2533 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2534 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2535 assert(OldResTy->isIntegerTy() && "only integer types supported");
2536 (void)OldResSizeInBits;
2537
2538 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2539
2540 // Any wrapping introduced by shrinking this operation shouldn't be
2541 // considered undefined behavior. So, we can't unconditionally copy
2542 // arithmetic wrapping flags to VPW.
2543 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2544 VPW->dropPoisonGeneratingFlags();
2545
2546 if (OldResSizeInBits != NewResSizeInBits &&
2547 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2548 // Extend result to original width.
2549 auto *Ext =
2550 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2551 Ext->insertAfter(&R);
2552 ResultVPV->replaceAllUsesWith(Ext);
2553 Ext->setOperand(0, ResultVPV);
2554 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2555 } else {
2556 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2557 "Only ICmps should not need extending the result.");
2558 }
2559
2560 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2562 continue;
2563
2564 // Shrink operands by introducing truncates as needed.
2565 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2566 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2567 auto *Op = R.getOperand(Idx);
2568 unsigned OpSizeInBits =
2570 if (OpSizeInBits == NewResSizeInBits)
2571 continue;
2572 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2573 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2574 if (!IterIsEmpty) {
2575 R.setOperand(Idx, ProcessedIter->second);
2576 continue;
2577 }
2578
2579 VPBuilder Builder;
2580 if (Op->isLiveIn())
2581 Builder.setInsertPoint(PH);
2582 else
2583 Builder.setInsertPoint(&R);
2584 VPWidenCastRecipe *NewOp =
2585 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2586 ProcessedIter->second = NewOp;
2587 R.setOperand(Idx, NewOp);
2588 }
2589
2590 }
2591 }
2592}
2593
2597 VPValue *Cond;
2598 // Skip blocks that are not terminated by BranchOnCond.
2599 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2600 continue;
2601
2602 assert(VPBB->getNumSuccessors() == 2 &&
2603 "Two successors expected for BranchOnCond");
2604 unsigned RemovedIdx;
2605 if (match(Cond, m_True()))
2606 RemovedIdx = 1;
2607 else if (match(Cond, m_False()))
2608 RemovedIdx = 0;
2609 else
2610 continue;
2611
2612 VPBasicBlock *RemovedSucc =
2613 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2614 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2615 "There must be a single edge between VPBB and its successor");
2616 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2617 // these recipes.
2618 for (VPRecipeBase &R : RemovedSucc->phis())
2619 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2620
2621 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2622 // automatically on VPlan destruction if it becomes unreachable.
2623 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2624 VPBB->back().eraseFromParent();
2625 }
2626}
2627
2647
2648// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2649// the loop terminator with a branch-on-cond recipe with the negated
2650// active-lane-mask as operand. Note that this turns the loop into an
2651// uncountable one. Only the existing terminator is replaced; all other existing
2652// recipes/users remain unchanged, except for poison-generating flags being
2653// dropped from the canonical IV increment. Return the created
2654// VPActiveLaneMaskPHIRecipe.
2655//
2656// The function uses the following definitions:
2657//
2658// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2659// calculate-trip-count-minus-VF (original TC) : original TC
2660// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2661// CanonicalIVPhi : CanonicalIVIncrement
2662// %StartV is the canonical induction start value.
2663//
2664// The function adds the following recipes:
2665//
2666// vector.ph:
2667// %TripCount = calculate-trip-count-minus-VF (original TC)
2668// [if DataWithControlFlowWithoutRuntimeCheck]
2669// %EntryInc = canonical-iv-increment-for-part %StartV
2670// %EntryALM = active-lane-mask %EntryInc, %TripCount
2671//
2672// vector.body:
2673// ...
2674// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2675// ...
2676// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2677// %ALM = active-lane-mask %InLoopInc, TripCount
2678// %Negated = Not %ALM
2679// branch-on-cond %Negated
2680//
2683 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2684 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2685 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2686 VPValue *StartV = CanonicalIVPHI->getStartValue();
2687
2688 auto *CanonicalIVIncrement =
2689 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2690 // TODO: Check if dropping the flags is needed if
2691 // !DataAndControlFlowWithoutRuntimeCheck.
2692 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2693 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2694 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2695 // we have to take unrolling into account. Each part needs to start at
2696 // Part * VF
2697 auto *VecPreheader = Plan.getVectorPreheader();
2698 VPBuilder Builder(VecPreheader);
2699
2700 // Create the ActiveLaneMask instruction using the correct start values.
2701 VPValue *TC = Plan.getTripCount();
2702
2703 VPValue *TripCount, *IncrementValue;
2705 // When the loop is guarded by a runtime overflow check for the loop
2706 // induction variable increment by VF, we can increment the value before
2707 // the get.active.lane.mask and use the unmodified trip count.
2708 IncrementValue = CanonicalIVIncrement;
2709 TripCount = TC;
2710 } else {
2711 // When avoiding a runtime check, the active.lane.mask inside the loop
2712 // uses a modified trip count and the induction variable increment is
2713 // done after the active.lane.mask intrinsic is called.
2714 IncrementValue = CanonicalIVPHI;
2715 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2716 {TC}, DL);
2717 }
2718 auto *EntryIncrement = Builder.createOverflowingOp(
2719 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2720 "index.part.next");
2721
2722 // Create the active lane mask instruction in the VPlan preheader.
2723 VPValue *ALMMultiplier =
2724 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2725 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2726 {EntryIncrement, TC, ALMMultiplier}, DL,
2727 "active.lane.mask.entry");
2728
2729 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2730 // preheader ActiveLaneMask instruction.
2731 auto *LaneMaskPhi =
2733 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2734
2735 // Create the active lane mask for the next iteration of the loop before the
2736 // original terminator.
2737 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2738 Builder.setInsertPoint(OriginalTerminator);
2739 auto *InLoopIncrement =
2740 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2741 {IncrementValue}, {false, false}, DL);
2742 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2743 {InLoopIncrement, TripCount, ALMMultiplier},
2744 DL, "active.lane.mask.next");
2745 LaneMaskPhi->addOperand(ALM);
2746
2747 // Replace the original terminator with BranchOnCond. We have to invert the
2748 // mask here because a true condition means jumping to the exit block.
2749 auto *NotMask = Builder.createNot(ALM, DL);
2750 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2751 OriginalTerminator->eraseFromParent();
2752 return LaneMaskPhi;
2753}
2754
2755/// Collect the header mask with the pattern:
2756/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2757/// TODO: Introduce explicit recipe for header-mask instead of searching
2758/// for the header-mask pattern manually.
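/// The pattern being searched for looks roughly like this (illustrative,
/// approximate printing):
///   EMIT vp<%wide.iv> = WIDEN-CANONICAL-INDUCTION vp<%canonical.iv>
///   EMIT vp<%header.mask> = icmp ule vp<%wide.iv>, vp<%backedge.taken.count>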
2760 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2761 SmallVector<VPValue *> WideCanonicalIVs;
2762 auto *FoundWidenCanonicalIVUser = find_if(
2764 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2766 "Must have at most one VPWideCanonicalIVRecipe");
2767 if (FoundWidenCanonicalIVUser !=
2768 LoopRegion->getCanonicalIV()->users().end()) {
2769 auto *WideCanonicalIV =
2770 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2771 WideCanonicalIVs.push_back(WideCanonicalIV);
2772 }
2773
2774 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2775 // version of the canonical induction.
2776 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2777 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2778 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2779 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2780 WideCanonicalIVs.push_back(WidenOriginalIV);
2781 }
2782
2783 // Walk users of wide canonical IVs and find the single compare of the form
2784 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2785 VPSingleDefRecipe *HeaderMask = nullptr;
2786 for (auto *Wide : WideCanonicalIVs) {
2787 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2788 auto *VPI = dyn_cast<VPInstruction>(U);
2789 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2790 continue;
2791
2792 assert(VPI->getOperand(0) == Wide &&
2793 "WidenCanonicalIV must be the first operand of the compare");
2794 assert(!HeaderMask && "Multiple header masks found?");
2795 HeaderMask = VPI;
2796 }
2797 }
2798 return HeaderMask;
2799}
2800
2802 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2805 UseActiveLaneMaskForControlFlow) &&
2806 "DataAndControlFlowWithoutRuntimeCheck implies "
2807 "UseActiveLaneMaskForControlFlow");
2808
2809 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2810 auto *FoundWidenCanonicalIVUser = find_if(
2812 assert(FoundWidenCanonicalIVUser &&
2813 "Must have widened canonical IV when tail folding!");
2814 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2815 auto *WideCanonicalIV =
2816 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2817 VPSingleDefRecipe *LaneMask;
2818 if (UseActiveLaneMaskForControlFlow) {
2821 } else {
2822 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2823 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2824 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2825 LaneMask =
2826 B.createNaryOp(VPInstruction::ActiveLaneMask,
2827 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2828 nullptr, "active.lane.mask");
2829 }
2830
2831 // Walk users of WideCanonicalIV and replace the header mask of the form
2832 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2833 // removing the old one to ensure there is always only a single header mask.
2834 HeaderMask->replaceAllUsesWith(LaneMask);
2835 HeaderMask->eraseFromParent();
2836}
2837
2838template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2839 Op0_t In;
2841
2842 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2843
2844 template <typename OpTy> bool match(OpTy *V) const {
2845 if (m_Specific(In).match(V)) {
2846 Out = nullptr;
2847 return true;
2848 }
2849 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2850 }
2851};
2852
2853/// Match a specific mask \p In, or a logical-and of \p In with another value.
2854/// \p Out is set to the other value, or to nullptr if \p In was matched directly.
2855template <typename Op0_t, typename Op1_t>
2856static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2857 Op1_t &Out) {
2858 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2859}
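// Usage sketch (based on the matcher above; MaskOp is a hypothetical operand):
//   VPValue *Mask;
//   if (match(MaskOp, m_RemoveMask(HeaderMask, Mask))) { ... }
// succeeds with Mask == nullptr when MaskOp is exactly the header mask, and
// with Mask set to the other operand when MaskOp is
// (logical-and HeaderMask, Mask).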
2860
2861/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2862/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2863/// recipe could be created.
2864/// \p HeaderMask Header Mask.
2865/// \p CurRecipe Recipe to be transformed.
2866/// \p TypeInfo VPlan-based type analysis.
2867/// \p EVL The explicit vector length parameter of vector-predication
2868/// intrinsics.
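/// Illustrative example (approximate printing, not from a specific test): a
/// load masked only by the header mask,
///   WIDEN ir<%l> = load vp<%addr>, vp<%header.mask>
/// is rewritten into its EVL-based variant with the redundant mask dropped:
///   WIDEN ir<%l> = vp.load vp<%addr>, vp<%evl>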
2870 VPRecipeBase &CurRecipe,
2871 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2872 VPlan *Plan = CurRecipe.getParent()->getPlan();
2873 DebugLoc DL = CurRecipe.getDebugLoc();
2874 VPValue *Addr, *Mask, *EndPtr;
2875
2876 /// Adjust any end pointers so that they point to the end of the EVL lanes rather than the VF lanes.
2877 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2878 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2879 EVLEndPtr->insertBefore(&CurRecipe);
2880 EVLEndPtr->setOperand(1, &EVL);
2881 return EVLEndPtr;
2882 };
2883
2884 if (match(&CurRecipe,
2885 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2886 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2887 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2888 EVL, Mask);
2889
2890 if (match(&CurRecipe,
2891 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2892 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2893 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2894 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2895 AdjustEndPtr(EndPtr), EVL, Mask);
2896
2897 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2898 m_RemoveMask(HeaderMask, Mask))) &&
2899 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2900 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2901 EVL, Mask);
2902
2903 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2904 m_RemoveMask(HeaderMask, Mask))) &&
2905 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2906 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2907 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2908 AdjustEndPtr(EndPtr), EVL, Mask);
2909
2910 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2911 if (Rdx->isConditional() &&
2912 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2913 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2914
2915 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2916 if (Interleave->getMask() &&
2917 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2918 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2919
2920 VPValue *LHS, *RHS;
2921 if (match(&CurRecipe,
2922 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2923 return new VPWidenIntrinsicRecipe(
2924 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2925 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2926
2927 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2928 m_VPValue(RHS))))
2929 return new VPWidenIntrinsicRecipe(
2930 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2931 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2932
2933 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2934 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2935 VPValue *ZExt =
2936 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2937 return new VPInstruction(Instruction::Sub,
2938 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2939 }
2940
2941 return nullptr;
2942}
2943
2944/// Replace recipes with their EVL variants.
2946 VPTypeAnalysis TypeInfo(Plan);
2947 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2948 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2949
2950 assert(all_of(Plan.getVF().users(),
2953 "User of VF that we can't transform to EVL.");
2954 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2956 });
2957
2958 assert(all_of(Plan.getVFxUF().users(),
2959 [&LoopRegion, &Plan](VPUser *U) {
2960 return match(U,
2961 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2962 m_Specific(&Plan.getVFxUF()))) ||
2963 isa<VPWidenPointerInductionRecipe>(U);
2964 }) &&
2965 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2966 "increment of the canonical induction.");
2967 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2968 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2969 // canonical induction must not be updated.
2971 });
2972
2973 // Defer erasing recipes till the end so that we don't invalidate the
2974 // VPTypeAnalysis cache.
2976
2977 // Create a scalar phi to track the previous EVL if the plan contains a
2978 // fixed-order recurrence.
2979 bool ContainsFORs =
2981 if (ContainsFORs) {
2982 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2983 VPValue *MaxEVL = &Plan.getVF();
2984 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2985 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2986 MaxEVL = Builder.createScalarZExtOrTrunc(
2987 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2988 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2989
2990 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2991 VPValue *PrevEVL = Builder.createScalarPhi(
2992 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2993
2996 for (VPRecipeBase &R : *VPBB) {
2997 VPValue *V1, *V2;
2998 if (!match(&R,
3000 m_VPValue(V1), m_VPValue(V2))))
3001 continue;
3002 VPValue *Imm = Plan.getOrAddLiveIn(
3005 Intrinsic::experimental_vp_splice,
3006 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3007 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3008 R.getDebugLoc());
3009 VPSplice->insertBefore(&R);
3010 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3011 ToErase.push_back(&R);
3012 }
3013 }
3014 }
3015
3016 VPValue *HeaderMask = findHeaderMask(Plan);
3017 if (!HeaderMask)
3018 return;
3019
3020 // Replace header masks with a mask equivalent to predicating by EVL:
3021 //
3022 // icmp ule widen-canonical-iv backedge-taken-count
3023 // ->
3024 // icmp ult step-vector, EVL
3025 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3026 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3027 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3028 VPValue *EVLMask = Builder.createICmp(
3030 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3031 HeaderMask->replaceAllUsesWith(EVLMask);
3032 ToErase.push_back(HeaderMask->getDefiningRecipe());
3033
3034 // Try to optimize header mask recipes away to their EVL variants.
3035 // TODO: Split optimizeMaskToEVL out and move into
3036 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
3037 // tryToBuildVPlanWithVPRecipes beforehand.
3038 for (VPUser *U : collectUsersRecursively(EVLMask)) {
3039 auto *CurRecipe = cast<VPRecipeBase>(U);
3040 VPRecipeBase *EVLRecipe =
3041 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
3042 if (!EVLRecipe)
3043 continue;
3044
3045 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
3046 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
3047 "New recipe must define the same number of values as the "
3048 "original.");
3049 EVLRecipe->insertBefore(CurRecipe);
3051 EVLRecipe)) {
3052 for (unsigned I = 0; I < NumDefVal; ++I) {
3053 VPValue *CurVPV = CurRecipe->getVPValue(I);
3054 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
3055 }
3056 }
3057 ToErase.push_back(CurRecipe);
3058 }
3059 // Remove dead EVL mask.
3060 if (EVLMask->getNumUsers() == 0)
3061 ToErase.push_back(EVLMask->getDefiningRecipe());
3062
3063 for (VPRecipeBase *R : reverse(ToErase)) {
3064 SmallVector<VPValue *> PossiblyDead(R->operands());
3065 R->eraseFromParent();
3066 for (VPValue *Op : PossiblyDead)
3068 }
3069}
3070
3071/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3072/// replaces all uses except the canonical IV increment of
3073/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
3074/// is used only for counting loop iterations after this transformation.
3075///
3076/// The function uses the following definitions:
3077/// %StartV is the canonical induction start value.
3078///
3079/// The function adds the following recipes:
3080///
3081/// vector.ph:
3082/// ...
3083///
3084/// vector.body:
3085/// ...
3086/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3087/// [ %NextEVLIV, %vector.body ]
3088/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3089/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3090/// ...
3091/// %OpEVL = cast i32 %VPEVL to IVSize
3092/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3093/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3094/// ...
3095///
3096/// If MaxSafeElements is provided, the function adds the following recipes:
3097/// vector.ph:
3098/// ...
3099///
3100/// vector.body:
3101/// ...
3102/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3103/// [ %NextEVLIV, %vector.body ]
3104/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3105/// %cmp = cmp ult %AVL, MaxSafeElements
3106/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3107/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3108/// ...
3109/// %OpEVL = cast i32 %VPEVL to IVSize
3110/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3111/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3112/// ...
3113///
3115 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3116 if (Plan.hasScalarVFOnly())
3117 return;
3118 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3119 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3120
3121 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3122 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3123 VPValue *StartV = CanonicalIVPHI->getStartValue();
3124
3125 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3126 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3127 EVLPhi->insertAfter(CanonicalIVPHI);
3128 VPBuilder Builder(Header, Header->getFirstNonPhi());
3129 // Create the AVL (application vector length), starting from TC -> 0 in steps
3130 // of EVL.
3131 VPPhi *AVLPhi = Builder.createScalarPhi(
3132 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3133 VPValue *AVL = AVLPhi;
3134
3135 if (MaxSafeElements) {
3136 // Clamp the AVL to MaxSafeElements so the maximum safe dependence distance is respected.
3137 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3138 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3139 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3140 "safe_avl");
3141 }
3142 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3144
3145 auto *CanonicalIVIncrement =
3146 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3147 Builder.setInsertPoint(CanonicalIVIncrement);
3148 VPValue *OpVPEVL = VPEVL;
3149
3150 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3151 OpVPEVL = Builder.createScalarZExtOrTrunc(
3152 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3153
3154 auto *NextEVLIV = Builder.createOverflowingOp(
3155 Instruction::Add, {OpVPEVL, EVLPhi},
3156 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3157 CanonicalIVIncrement->hasNoSignedWrap()},
3158 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3159 EVLPhi->addOperand(NextEVLIV);
3160
3161 VPValue *NextAVL = Builder.createOverflowingOp(
3162 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3163 DebugLoc::getCompilerGenerated(), "avl.next");
3164 AVLPhi->addOperand(NextAVL);
3165
3166 transformRecipestoEVLRecipes(Plan, *VPEVL);
3167
3168 // Replace all uses of VPCanonicalIVPHIRecipe by
3169 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3170 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3171 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3172 // TODO: support unroll factor > 1.
3173 Plan.setUF(1);
3174}
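// Illustrative trace of the EVL loop above (assumed values, and assuming the
// target returns the full vector length whenever enough elements remain): with
// trip-count 10 and a maximum vector length of 4, the AVL phi counts
// 10 -> 6 -> 2 -> 0 while EXPLICIT-VECTOR-LENGTH yields 4, 4 and 2, so
// %NextEVLIV advances the EVL-based IV by exactly the number of elements
// processed in each iteration:
//
//   iteration:   0    1    2
//   %AVL:       10    6    2
//   %VPEVL:      4    4    2
//   %NextAVL:    6    2    0   (the loop exits once the AVL reaches 0)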
3175
3176void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
3177  // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3178 // There should be only one EVL PHI in the entire plan.
3179 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3180
3183 for (VPRecipeBase &R : VPBB->phis())
3184 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3185 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3186 EVLPhi = PhiR;
3187 }
3188
3189 // Early return if no EVL PHI is found.
3190 if (!EVLPhi)
3191 return;
3192
3193 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3194 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3195 VPValue *AVL;
3196 [[maybe_unused]] bool FoundAVL =
3197 match(EVLIncrement,
3198 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3199 assert(FoundAVL && "Didn't find AVL?");
3200
3201 // The AVL may be capped to a safe distance.
3202 VPValue *SafeAVL;
3203 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3204 AVL = SafeAVL;
3205
3206 VPValue *AVLNext;
3207 [[maybe_unused]] bool FoundAVLNext =
3209 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3210 assert(FoundAVLNext && "Didn't find AVL backedge?");
3211
3212 // Convert EVLPhi to concrete recipe.
3213 auto *ScalarR =
3214 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3215 EVLPhi->getDebugLoc(), "evl.based.iv");
3216 EVLPhi->replaceAllUsesWith(ScalarR);
3217 EVLPhi->eraseFromParent();
3218
3219 // Replace CanonicalIVInc with EVL-PHI increment.
3220 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3221 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3222 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3223 m_Specific(&Plan.getVFxUF()))) &&
3224 "Unexpected canonical iv");
3225 Backedge->replaceAllUsesWith(EVLIncrement);
3226
3227 // Remove unused phi and increment.
3228 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3229 CanonicalIVIncrement->eraseFromParent();
3230 CanonicalIV->eraseFromParent();
3231
3232 // Replace the use of VectorTripCount in the latch-exiting block.
3233 // Before: (branch-on-count EVLIVInc, VectorTripCount)
3234 // After: (branch-on-cond eq AVLNext, 0)
3235
3236 VPBasicBlock *LatchExiting =
3237 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3238 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3239 // Skip single-iteration loop region
3240 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3241 return;
3242 assert(LatchExitingBr &&
3243 match(LatchExitingBr,
3244 m_BranchOnCount(m_VPValue(EVLIncrement),
3245 m_Specific(&Plan.getVectorTripCount()))) &&
3246 "Unexpected terminator in EVL loop");
3247
3248 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3249 VPBuilder Builder(LatchExitingBr);
3250 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3251 Plan.getConstantInt(AVLTy, 0));
3252 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
3253 LatchExitingBr->eraseFromParent();
3254}
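// Illustrative before/after sketch for the canonicalization above (recipe
// names follow the plan-printing convention used in the comments earlier in
// this file):
//
//   Before: %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ 0, ph ], [ %NextEVLIV, body ]
//           %CanIV  = CANONICAL-INDUCTION [ 0, ph ], [ %CanIV.next, body ]
//   After:  %evl.based.iv = phi [ 0, ph ], [ %NextEVLIV, body ]
//           (the canonical IV phi and its increment are erased, their users
//            are rewired to %evl.based.iv and %NextEVLIV, and the latch
//            terminator compares %NextAVL against 0 instead of counting up to
//            the vector trip count)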
3255
3256void VPlanTransforms::replaceSymbolicStrides(
3257    VPlan &Plan, PredicatedScalarEvolution &PSE,
3258 const DenseMap<Value *, const SCEV *> &StridesMap) {
3259  // Replace VPValues for known constant strides guaranteed by predicated
3260  // scalar evolution.
3261 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3262 auto *R = cast<VPRecipeBase>(&U);
3263 return R->getRegion() ||
3264 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3265 };
3266 ValueToSCEVMapTy RewriteMap;
3267 for (const SCEV *Stride : StridesMap.values()) {
3268 using namespace SCEVPatternMatch;
3269 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3270 const APInt *StrideConst;
3271 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3272 // Only handle constant strides for now.
3273 continue;
3274
3275 auto *CI = Plan.getConstantInt(*StrideConst);
3276 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3277 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3278
3279 // The versioned value may not be used in the loop directly but through a
3280 // sext/zext. Add new live-ins in those cases.
3281 for (Value *U : StrideV->users()) {
3282      if (!isa<SExtInst, ZExtInst>(U))
3283        continue;
3284 VPValue *StrideVPV = Plan.getLiveIn(U);
3285 if (!StrideVPV)
3286 continue;
3287 unsigned BW = U->getType()->getScalarSizeInBits();
3288 APInt C =
3289 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3290 VPValue *CI = Plan.getConstantInt(C);
3291 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3292 }
3293 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3294 }
3295
3296 for (VPRecipeBase &R : *Plan.getEntry()) {
3297 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3298 if (!ExpSCEV)
3299 continue;
3300 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3301 auto *NewSCEV =
3302 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3303 if (NewSCEV != ScevExpr) {
3304 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3305 ExpSCEV->replaceAllUsesWith(NewExp);
3306 if (Plan.getTripCount() == ExpSCEV)
3307 Plan.resetTripCount(NewExp);
3308 }
3309 }
3310}
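// Illustrative example for the stride replacement above (assumed values, not
// taken from the source): if versioning guarantees a symbolic stride %s == 1,
// the live-in for %s is replaced by the constant 1, and a live-in for
// (zext i32 %s to i64) is replaced by the 64-bit constant 1, but only for
// users inside the loop region or in the vector preheader, where the runtime
// checks guarding the versioned value are known to have passed.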
3311
3312void VPlanTransforms::dropPoisonGeneratingRecipes(
3313    VPlan &Plan,
3314 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3315 // Collect recipes in the backward slice of `Root` that may generate a poison
3316 // value that is used after vectorization.
3318 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3320 Worklist.push_back(Root);
3321
3322 // Traverse the backward slice of Root through its use-def chain.
3323 while (!Worklist.empty()) {
3324 VPRecipeBase *CurRec = Worklist.pop_back_val();
3325
3326 if (!Visited.insert(CurRec).second)
3327 continue;
3328
3329 // Prune search if we find another recipe generating a widen memory
3330 // instruction. Widen memory instructions involved in address computation
3331 // will lead to gather/scatter instructions, which don't need to be
3332 // handled.
3333      if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3334              VPHeaderPHIRecipe>(CurRec))
3335 continue;
3336
3337 // This recipe contributes to the address computation of a widen
3338 // load/store. If the underlying instruction has poison-generating flags,
3339 // drop them directly.
3340 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3341 VPValue *A, *B;
3342 // Dropping disjoint from an OR may yield incorrect results, as some
3343 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3344 // for dependence analysis). Instead, replace it with an equivalent Add.
3345 // This is possible as all users of the disjoint OR only access lanes
3346 // where the operands are disjoint or poison otherwise.
3347 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3348 RecWithFlags->isDisjoint()) {
3349 VPBuilder Builder(RecWithFlags);
3350 VPInstruction *New = Builder.createOverflowingOp(
3351 Instruction::Add, {A, B}, {false, false},
3352 RecWithFlags->getDebugLoc());
3353 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3354 RecWithFlags->replaceAllUsesWith(New);
3355 RecWithFlags->eraseFromParent();
3356 CurRec = New;
3357 } else
3358 RecWithFlags->dropPoisonGeneratingFlags();
3359 } else {
3362 (void)Instr;
3363 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3364 "found instruction with poison generating flags not covered by "
3365 "VPRecipeWithIRFlags");
3366 }
3367
3368 // Add new definitions to the worklist.
3369 for (VPValue *Operand : CurRec->operands())
3370 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3371 Worklist.push_back(OpDef);
3372 }
3373 });
3374
3375  // Traverse all the recipes in the VPlan and collect the poison-generating
3376  // recipes in the backward slice starting at the address of a
3377  // VPWidenMemoryRecipe or VPInterleaveRecipe.
3378 auto Iter = vp_depth_first_deep(Plan.getEntry());
3379  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3380    for (VPRecipeBase &Recipe : *VPBB) {
3381 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3382 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3383 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3384 if (AddrDef && WidenRec->isConsecutive() &&
3385 BlockNeedsPredication(UnderlyingInstr.getParent()))
3386 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3387 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3388 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3389 if (AddrDef) {
3390 // Check if any member of the interleave group needs predication.
3391 const InterleaveGroup<Instruction> *InterGroup =
3392 InterleaveRec->getInterleaveGroup();
3393 bool NeedPredication = false;
3394 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3395 I < NumMembers; ++I) {
3396 Instruction *Member = InterGroup->getMember(I);
3397 if (Member)
3398 NeedPredication |= BlockNeedsPredication(Member->getParent());
3399 }
3400
3401 if (NeedPredication)
3402 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3403 }
3404 }
3405 }
3406 }
3407}
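// Illustrative example for the flag handling above (assumed IR, not from the
// source): for a consecutive masked load whose address is computed as
//   %off = or disjoint i64 %base, 1
//   %gep = getelementptr inbounds float, ptr %p, i64 %off
// the transform rewrites %off to "add i64 %base, 1" (rather than merely
// dropping "disjoint") and drops "inbounds" from %gep's recipe, so that
// speculatively executing masked-off lanes cannot introduce poison.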
3408
3409void VPlanTransforms::createInterleaveGroups(
3410    VPlan &Plan,
3411    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3412        &InterleaveGroups,
3413 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3414 if (InterleaveGroups.empty())
3415 return;
3416
3417 // Interleave memory: for each Interleave Group we marked earlier as relevant
3418 // for this VPlan, replace the Recipes widening its memory instructions with a
3419 // single VPInterleaveRecipe at its insertion point.
3420 VPDominatorTree VPDT(Plan);
3421 for (const auto *IG : InterleaveGroups) {
3422 auto *Start =
3423 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3424 VPIRMetadata InterleaveMD(*Start);
3425 SmallVector<VPValue *, 4> StoredValues;
3426 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3427 StoredValues.push_back(StoreR->getStoredValue());
3428 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3429 Instruction *MemberI = IG->getMember(I);
3430 if (!MemberI)
3431 continue;
3432 VPWidenMemoryRecipe *MemoryR =
3433 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3434 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3435 StoredValues.push_back(StoreR->getStoredValue());
3436 InterleaveMD.intersect(*MemoryR);
3437 }
3438
3439 bool NeedsMaskForGaps =
3440 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3441 (!StoredValues.empty() && !IG->isFull());
3442
3443 Instruction *IRInsertPos = IG->getInsertPos();
3444 auto *InsertPos =
3445 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3446
3448 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3449 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3450 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3451
3452 // Get or create the start address for the interleave group.
3453 VPValue *Addr = Start->getAddr();
3454 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3455 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3456 // We cannot re-use the address of member zero because it does not
3457 // dominate the insert position. Instead, use the address of the insert
3458 // position and create a PtrAdd adjusting it to the address of member
3459 // zero.
3460 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3461 // InsertPos or sink loads above zero members to join it.
3462 assert(IG->getIndex(IRInsertPos) != 0 &&
3463 "index of insert position shouldn't be zero");
3464 auto &DL = IRInsertPos->getDataLayout();
3465 APInt Offset(32,
3466 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3467 IG->getIndex(IRInsertPos),
3468 /*IsSigned=*/true);
3469 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3470 VPBuilder B(InsertPos);
3471 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3472 }
3473 // If the group is reverse, adjust the index to refer to the last vector
3474 // lane instead of the first. We adjust the index from the first vector
3475 // lane, rather than directly getting the pointer for lane VF - 1, because
3476 // the pointer operand of the interleaved access is supposed to be uniform.
3477 if (IG->isReverse()) {
3478 auto *ReversePtr = new VPVectorEndPointerRecipe(
3479 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3480 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3481 ReversePtr->insertBefore(InsertPos);
3482 Addr = ReversePtr;
3483 }
3484 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3485 InsertPos->getMask(), NeedsMaskForGaps,
3486 InterleaveMD, InsertPos->getDebugLoc());
3487 VPIG->insertBefore(InsertPos);
3488
3489 unsigned J = 0;
3490 for (unsigned i = 0; i < IG->getFactor(); ++i)
3491 if (Instruction *Member = IG->getMember(i)) {
3492 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3493 if (!Member->getType()->isVoidTy()) {
3494 VPValue *OriginalV = MemberR->getVPSingleValue();
3495 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3496 J++;
3497 }
3498 MemberR->eraseFromParent();
3499 }
3500 }
3501}
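// Illustrative example for the insert-position adjustment above (assumed types
// and indices): for a group of i32 members whose insert position is the member
// at index 2, the group's start address is rebuilt as
// ptradd(InsertPosAddr, -8), i.e. two 4-byte members back to member 0.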
3502
3503/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3504/// value, phi and backedge value. In the following example:
3505///
3506/// vector.ph:
3507/// Successor(s): vector loop
3508///
3509/// <x1> vector loop: {
3510/// vector.body:
3511/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3512/// ...
3513/// EMIT branch-on-count ...
3514/// No successors
3515/// }
3516///
3517/// WIDEN-INDUCTION will get expanded to:
3518///
3519/// vector.ph:
3520/// ...
3521/// vp<%induction.start> = ...
3522/// vp<%induction.increment> = ...
3523///
3524/// Successor(s): vector loop
3525///
3526/// <x1> vector loop: {
3527/// vector.body:
3528/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3529/// ...
3530/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3531/// EMIT branch-on-count ...
3532/// No successors
3533/// }
3534static void
3535expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3536                              VPTypeAnalysis &TypeInfo) {
3537 VPlan *Plan = WidenIVR->getParent()->getPlan();
3538 VPValue *Start = WidenIVR->getStartValue();
3539 VPValue *Step = WidenIVR->getStepValue();
3540 VPValue *VF = WidenIVR->getVFValue();
3541 DebugLoc DL = WidenIVR->getDebugLoc();
3542
3543 // The value from the original loop to which we are mapping the new induction
3544 // variable.
3545 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3546
3547 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3548  Instruction::BinaryOps AddOp;
3549  Instruction::BinaryOps MulOp;
3550  VPIRFlags Flags = *WidenIVR;
3551 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3552 AddOp = Instruction::Add;
3553 MulOp = Instruction::Mul;
3554 } else {
3555 AddOp = ID.getInductionOpcode();
3556 MulOp = Instruction::FMul;
3557 }
3558
3559 // If the phi is truncated, truncate the start and step values.
3560 VPBuilder Builder(Plan->getVectorPreheader());
3561 Type *StepTy = TypeInfo.inferScalarType(Step);
3562 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3563 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3564 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3565 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3566 // Truncation doesn't preserve WrapFlags.
3567 Flags.dropPoisonGeneratingFlags();
3568 StepTy = Ty;
3569 }
3570
3571 // Construct the initial value of the vector IV in the vector loop preheader.
3572  Type *IVIntTy =
3573      IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3574  VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3575 if (StepTy->isFloatingPointTy())
3576 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3577
3578 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3579 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3580
3581 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3582 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3583 DebugLoc::getUnknown(), "induction");
3584
3585 // Create the widened phi of the vector IV.
3586 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3587 WidenIVR->getDebugLoc(), "vec.ind");
3588 WidePHI->insertBefore(WidenIVR);
3589
3590 // Create the backedge value for the vector IV.
3591 VPValue *Inc;
3592 VPValue *Prev;
3593 // If unrolled, use the increment and prev value from the operands.
3594 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3595 Inc = SplatVF;
3596 Prev = WidenIVR->getLastUnrolledPartOperand();
3597 } else {
3598 if (VPRecipeBase *R = VF->getDefiningRecipe())
3599 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3600 // Multiply the vectorization factor by the step using integer or
3601 // floating-point arithmetic as appropriate.
3602 if (StepTy->isFloatingPointTy())
3603 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3604 DL);
3605 else
3606 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3607 TypeInfo.inferScalarType(VF), DL);
3608
3609 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3610 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3611 Prev = WidePHI;
3612 }
3613
3614  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3615  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3616 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3617 WidenIVR->getDebugLoc(), "vec.ind.next");
3618
3619 WidePHI->addOperand(Next);
3620
3621 WidenIVR->replaceAllUsesWith(WidePHI);
3622}
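// Illustrative expansion for a concrete induction (assumed values): with
// %start = 0, %step = 2 and VF = 4, the preheader computes
//   vp<%induction.start>     = <0, 1, 2, 3> * 2 + 0 = <0, 2, 4, 6>
//   vp<%induction.increment> = broadcast(2 * 4)     = <8, 8, 8, 8>
// and the loop advances ir<%i> by <8, 8, 8, 8> on every backedge.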
3623
3624/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3625/// initial value, phi and backedge value. In the following example:
3626///
3627/// <x1> vector loop: {
3628/// vector.body:
3629/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3630/// ...
3631/// EMIT branch-on-count ...
3632/// }
3633///
3634/// WIDEN-POINTER-INDUCTION will get expanded to:
3635///
3636/// <x1> vector loop: {
3637/// vector.body:
3638/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3639/// EMIT %mul = mul %stepvector, %step
3640/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3641/// ...
3642/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3643/// EMIT branch-on-count ...
3644/// }
3645static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3646                                          VPTypeAnalysis &TypeInfo) {
3647 VPlan *Plan = R->getParent()->getPlan();
3648 VPValue *Start = R->getStartValue();
3649 VPValue *Step = R->getStepValue();
3650 VPValue *VF = R->getVFValue();
3651
3652 assert(R->getInductionDescriptor().getKind() ==
3654 "Not a pointer induction according to InductionDescriptor!");
3655 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3656 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3657 "Recipe should have been replaced");
3658
3659 VPBuilder Builder(R);
3660 DebugLoc DL = R->getDebugLoc();
3661
3662 // Build a scalar pointer phi.
3663 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3664
3665 // Create actual address geps that use the pointer phi as base and a
3666 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3667 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3668 Type *StepTy = TypeInfo.inferScalarType(Step);
3669 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3670 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3671 VPValue *PtrAdd = Builder.createNaryOp(
3672 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3673 R->replaceAllUsesWith(PtrAdd);
3674
3675 // Create the backedge value for the scalar pointer phi.
3676  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3677  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3678 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3679 DL);
3680 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3681
3682 VPValue *InductionGEP =
3683 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3684 ScalarPtrPhi->addOperand(InductionGEP);
3685}
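// Illustrative expansion for a concrete pointer induction (assumed values):
// with %start = %p, a byte stride of 4 and VF = 4, each iteration computes
//   %vector.gep = wide-ptradd %pointer.phi, <0, 4, 8, 12>
// and the scalar backedge value is %ptr.ind = ptradd %pointer.phi, 16.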
3686
3687void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3688  // Replace loop regions with explicit CFG.
3689 SmallVector<VPRegionBlock *> LoopRegions;
3691 vp_depth_first_deep(Plan.getEntry()))) {
3692 if (!R->isReplicator())
3693 LoopRegions.push_back(R);
3694 }
3695 for (VPRegionBlock *R : LoopRegions)
3696 R->dissolveToCFGLoop();
3697}
3698
3699void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3700  VPTypeAnalysis TypeInfo(Plan);
3701  SmallVector<VPRecipeBase *> ToRemove;
3702  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3703           vp_depth_first_deep(Plan.getEntry()))) {
3704 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3705 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3706 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3707 ToRemove.push_back(WidenIVR);
3708 continue;
3709 }
3710
3711 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3712 // If the recipe only generates scalars, scalarize it instead of
3713 // expanding it.
3714 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3715 VPBuilder Builder(WidenIVR);
3716 VPValue *PtrAdd =
3717 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3718 WidenIVR->replaceAllUsesWith(PtrAdd);
3719 ToRemove.push_back(WidenIVR);
3720 continue;
3721 }
3722 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3723 ToRemove.push_back(WidenIVR);
3724 continue;
3725 }
3726
3727 // Expand VPBlendRecipe into VPInstruction::Select.
3728 VPBuilder Builder(&R);
3729 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3730 VPValue *Select = Blend->getIncomingValue(0);
3731 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3732 Select = Builder.createSelect(Blend->getMask(I),
3733 Blend->getIncomingValue(I), Select,
3734 R.getDebugLoc(), "predphi");
3735 Blend->replaceAllUsesWith(Select);
3736 ToRemove.push_back(Blend);
3737 }
3738
3739 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3740 Expr->decompose();
3741 ToRemove.push_back(Expr);
3742 }
3743
3744 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3745 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3746 if (LastActiveL &&
3747 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3748 // Create Not(Mask) for all operands.
3749        SmallVector<VPValue *> NotMasks;
3750        for (VPValue *Op : LastActiveL->operands()) {
3751 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3752 NotMasks.push_back(NotMask);
3753 }
3754
3755 // Create FirstActiveLane on the inverted masks.
3756 VPValue *FirstInactiveLane = Builder.createNaryOp(
3757            VPInstruction::FirstActiveLane, NotMasks,
3758            LastActiveL->getDebugLoc(), "first.inactive.lane");
3759
3760 // Subtract 1 to get the last active lane.
3761 VPValue *One = Plan.getOrAddLiveIn(
3762 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3763 VPValue *LastLane = Builder.createNaryOp(
3764 Instruction::Sub, {FirstInactiveLane, One},
3765 LastActiveL->getDebugLoc(), "last.active.lane");
3766
3767 LastActiveL->replaceAllUsesWith(LastLane);
3768 ToRemove.push_back(LastActiveL);
3769 continue;
3770 }
3771
3772 VPValue *VectorStep;
3773 VPValue *ScalarStep;
3774      if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3775                         m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3776 continue;
3777
3778 // Expand WideIVStep.
3779 auto *VPI = cast<VPInstruction>(&R);
3780 Type *IVTy = TypeInfo.inferScalarType(VPI);
3781 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3782        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3783                                          ? Instruction::UIToFP
3784 : Instruction::Trunc;
3785 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3786 }
3787
3788 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3789 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3790 ScalarStep =
3791 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3792 }
3793
3794 VPIRFlags Flags;
3795 if (IVTy->isFloatingPointTy())
3796 Flags = {VPI->getFastMathFlags()};
3797
3798 unsigned MulOpc =
3799 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3800 VPInstruction *Mul = Builder.createNaryOp(
3801 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3802 VectorStep = Mul;
3803 VPI->replaceAllUsesWith(VectorStep);
3804 ToRemove.push_back(VPI);
3805 }
3806 }
3807
3808 for (VPRecipeBase *R : ToRemove)
3809 R->eraseFromParent();
3810}
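// Illustrative examples for the expansions above (assumed values): a
// VPBlendRecipe with incoming values (v0, v1 under mask m1, v2 under mask m2)
// is lowered to select(m2, v2, select(m1, v1, v0)). For LastActiveLane with a
// 4-lane mask <1, 1, 0, 0>, Not yields <0, 0, 1, 1>, FirstActiveLane of the
// inverted mask is 2, and subtracting 1 gives lane 1, the last active lane.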
3811
3812void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3813                                                 VPBasicBlock *EarlyExitVPBB,
3814 VPlan &Plan,
3815 VPBasicBlock *HeaderVPBB,
3816 VPBasicBlock *LatchVPBB) {
3817 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3818 if (!EarlyExitVPBB->getSinglePredecessor() &&
3819 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3820 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3821 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3822 "unsupported early exit VPBB");
3823 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3824 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3825 // of the phis.
3826 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3827 cast<VPIRPhi>(&R)->swapOperands();
3828 }
3829
3830 VPBuilder Builder(LatchVPBB->getTerminator());
3831 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3832 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3833         "Terminator must be BranchOnCond");
3834 VPValue *CondOfEarlyExitingVPBB =
3835 EarlyExitingVPBB->getTerminator()->getOperand(0);
3836 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3837 ? CondOfEarlyExitingVPBB
3838 : Builder.createNot(CondOfEarlyExitingVPBB);
3839
3840 // Split the middle block and have it conditionally branch to the early exit
3841 // block if CondToEarlyExit.
3842 VPValue *IsEarlyExitTaken =
3843 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3844 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3845 VPBasicBlock *VectorEarlyExitVPBB =
3846 Plan.createVPBasicBlock("vector.early.exit");
3847 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3848 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3849 NewMiddle->swapSuccessors();
3850
3851 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3852
3853 // Update the exit phis in the early exit block.
3854 VPBuilder MiddleBuilder(NewMiddle);
3855 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3856 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3857 auto *ExitIRI = cast<VPIRPhi>(&R);
3858 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3859 // a single predecessor and 1 if it has two.
3860 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3861 if (ExitIRI->getNumOperands() != 1) {
3862 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3863 // predecessor. Extract its final lane.
3864 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3865 }
3866
3867 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3868 if (!IncomingFromEarlyExit->isLiveIn()) {
3869 // Update the incoming value from the early exit.
3870 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3871 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3872 DebugLoc::getUnknown(), "first.active.lane");
3873 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3874 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3875 DebugLoc::getUnknown(), "early.exit.value");
3876 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3877 }
3878 }
3879 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3880
3881 // Replace the condition controlling the non-early exit from the vector loop
3882 // with one exiting if either the original condition of the vector latch is
3883 // true or the early exit has been taken.
3884 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3885 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3886 "Unexpected terminator");
3887 auto *IsLatchExitTaken =
3888 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3889 LatchExitingBranch->getOperand(1));
3890 auto *AnyExitTaken = Builder.createNaryOp(
3891 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3892 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3893 LatchExitingBranch->eraseFromParent();
3894}
3895
3896/// This function tries to convert extended in-loop reductions to
3897/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3898/// valid. The created recipe must be decomposed to its constituent
3899/// recipes before execution.
3900static VPExpressionRecipe *
3901tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3902                                     VFRange &Range) {
3903 Type *RedTy = Ctx.Types.inferScalarType(Red);
3904 VPValue *VecOp = Red->getVecOp();
3905
3906 // Clamp the range if using extended-reduction is profitable.
3907 auto IsExtendedRedValidAndClampRange =
3908 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3909    return LoopVectorizationPlanner::getDecisionAndClampRange(
3910        [&](ElementCount VF) {
3911 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3913
3914 InstructionCost ExtRedCost;
3915 InstructionCost ExtCost =
3916 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3917 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3918
3919 if (Red->isPartialReduction()) {
3922 // FIXME: Move partial reduction creation, costing and clamping
3923 // here from LoopVectorize.cpp.
3924 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3925 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3926 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3927 } else {
3928 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3929 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3930 Red->getFastMathFlags(), CostKind);
3931 }
3932 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3933 },
3934 Range);
3935 };
3936
3937 VPValue *A;
3938  // Match reduce(ext(A)).
3939 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3940 IsExtendedRedValidAndClampRange(
3941 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3942 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3943 Ctx.Types.inferScalarType(A)))
3944 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3945
3946 return nullptr;
3947}
3948
3949/// This function tries to convert extended in-loop reductions to
3950/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3951/// and valid. The created VPExpressionRecipe must be decomposed to its
3952/// constituent recipes before execution. Patterns of the
3953/// VPExpressionRecipe:
3954/// reduce.add(mul(...)),
3955/// reduce.add(mul(ext(A), ext(B))),
3956/// reduce.add(ext(mul(ext(A), ext(B)))).
3957static VPExpressionRecipe *
3958tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3959                                          VPCostContext &Ctx, VFRange &Range) {
3960 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3961 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3962 return nullptr;
3963
3964 Type *RedTy = Ctx.Types.inferScalarType(Red);
3965
3966 // Clamp the range if using multiply-accumulate-reduction is profitable.
3967 auto IsMulAccValidAndClampRange =
3968      [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
3969          VPWidenCastRecipe *OuterExt) -> bool {
3970    return LoopVectorizationPlanner::getDecisionAndClampRange(
3971        [&](ElementCount VF) {
3973 Type *SrcTy =
3974 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3975 InstructionCost MulAccCost;
3976
3977 if (Red->isPartialReduction()) {
3978 Type *SrcTy2 =
3979 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3980 // FIXME: Move partial reduction creation, costing and clamping
3981 // here from LoopVectorize.cpp.
3982 MulAccCost = Ctx.TTI.getPartialReductionCost(
3983 Opcode, SrcTy, SrcTy2, RedTy, VF,
3985 Ext0->getOpcode())
3988 Ext1->getOpcode())
3990 Mul->getOpcode(), CostKind);
3991 } else {
3992 // Only partial reductions support mixed extends at the moment.
3993 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3994 return false;
3995
3996 bool IsZExt =
3997 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3998 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3999 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4000 SrcVecTy, CostKind);
4001 }
4002
4003 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4004 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4005 InstructionCost ExtCost = 0;
4006 if (Ext0)
4007 ExtCost += Ext0->computeCost(VF, Ctx);
4008 if (Ext1)
4009 ExtCost += Ext1->computeCost(VF, Ctx);
4010 if (OuterExt)
4011 ExtCost += OuterExt->computeCost(VF, Ctx);
4012
4013 return MulAccCost.isValid() &&
4014 MulAccCost < ExtCost + MulCost + RedCost;
4015 },
4016 Range);
4017 };
4018
4019 VPValue *VecOp = Red->getVecOp();
4020 VPRecipeBase *Sub = nullptr;
4021 VPValue *A, *B;
4022 VPValue *Tmp = nullptr;
4023 // Sub reductions could have a sub between the add reduction and vec op.
4024 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4025 Sub = VecOp->getDefiningRecipe();
4026 VecOp = Tmp;
4027 }
4028
4029 // If ValB is a constant and can be safely extended, truncate it to the same
4030 // type as ExtA's operand, then extend it to the same type as ExtA. This
4031 // creates two uniform extends that can more easily be matched by the rest of
4032 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4033 // replaced with the new extend of the constant.
4034 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4035 VPWidenCastRecipe *&ExtB,
4036 VPValue *&ValB, VPWidenRecipe *Mul) {
4037 if (!ExtA || ExtB || !ValB->isLiveIn())
4038 return;
4039 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4040 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4041 const APInt *Const;
4042 if (!match(ValB, m_APInt(Const)) ||
4044 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4045 return;
4046 // The truncate ensures that the type of each extended operand is the
4047 // same, and it's been proven that the constant can be extended from
4048 // NarrowTy safely. Necessary since ExtA's extended operand would be
4049 // e.g. an i8, while the const will likely be an i32. This will be
4050 // elided by later optimisations.
4051 VPBuilder Builder(Mul);
4052 auto *Trunc =
4053 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4054 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4055 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4056 Mul->setOperand(1, ExtB);
4057 };
4058
4059 // Try to match reduce.add(mul(...)).
4060 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4063 auto *Mul = cast<VPWidenRecipe>(VecOp);
4064
4065 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4066 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4067
4068 // Match reduce.add/sub(mul(ext, ext)).
4069 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4070 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4071 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4072 if (Sub)
4073 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4074 cast<VPWidenRecipe>(Sub), Red);
4075 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4076 }
4077 // TODO: Add an expression type for this variant with a negated mul
4078 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4079 return new VPExpressionRecipe(Mul, Red);
4080 }
4081 // TODO: Add an expression type for negated versions of other expression
4082 // variants.
4083 if (Sub)
4084 return nullptr;
4085
4086 // Match reduce.add(ext(mul(A, B))).
4087 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4088 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4089 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4092
4093 // reduce.add(ext(mul(ext, const)))
4094 // -> reduce.add(ext(mul(ext, ext(const))))
4095 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4096
4097 // reduce.add(ext(mul(ext(A), ext(B))))
4098 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4099 // The inner extends must either have the same opcode as the outer extend or
4100 // be the same, in which case the multiply can never result in a negative
4101 // value and the outer extend can be folded away by doing wider
4102 // extends for the operands of the mul.
4103 if (Ext0 && Ext1 &&
4104 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4105 Ext0->getOpcode() == Ext1->getOpcode() &&
4106 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4107 auto *NewExt0 = new VPWidenCastRecipe(
4108 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4109 *Ext0, *Ext0, Ext0->getDebugLoc());
4110 NewExt0->insertBefore(Ext0);
4111
4112 VPWidenCastRecipe *NewExt1 = NewExt0;
4113 if (Ext0 != Ext1) {
4114 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4115 Ext->getResultType(), nullptr, *Ext1,
4116 *Ext1, Ext1->getDebugLoc());
4117 NewExt1->insertBefore(Ext1);
4118 }
4119 Mul->setOperand(0, NewExt0);
4120 Mul->setOperand(1, NewExt1);
4121 Red->setOperand(1, Mul);
4122 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4123 }
4124 }
4125 return nullptr;
4126}
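// Illustrative example for the last pattern above (assumed types): for
//   reduce.add(zext i16 (mul (zext i8 %a to i16), (zext i8 %b to i16)) to i32)
// the inner extends are rewritten to extend straight to i32 and the outer
// extend is folded away, so the whole chain is bundled as
//   reduce.add(mul(zext i8 %a to i32, zext i8 %b to i32))
// in a single VPExpressionRecipe, provided the TTI cost of the bundled form is
// lower than the separate extend, multiply and reduction recipes.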
4127
4128/// This function tries to create abstract recipes from the reduction recipe
4129/// for subsequent optimizations and cost estimation.
4130static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
4131                                               VPCostContext &Ctx,
4132 VFRange &Range) {
4133 VPExpressionRecipe *AbstractR = nullptr;
4134 auto IP = std::next(Red->getIterator());
4135 auto *VPBB = Red->getParent();
4136 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4137 AbstractR = MulAcc;
4138 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4139 AbstractR = ExtRed;
4140 // Cannot create abstract inloop reduction recipes.
4141 if (!AbstractR)
4142 return;
4143
4144 AbstractR->insertBefore(*VPBB, IP);
4145 Red->replaceAllUsesWith(AbstractR);
4146}
4147
4158
4159void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4160  if (Plan.hasScalarVFOnly())
4161 return;
4162
4163#ifndef NDEBUG
4164 VPDominatorTree VPDT(Plan);
4165#endif
4166
4167 SmallVector<VPValue *> VPValues;
4170 append_range(VPValues, Plan.getLiveIns());
4171 for (VPRecipeBase &R : *Plan.getEntry())
4172 append_range(VPValues, R.definedValues());
4173
4174 auto *VectorPreheader = Plan.getVectorPreheader();
4175 for (VPValue *VPV : VPValues) {
4177 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
4178 isa<Constant>(VPV->getLiveInIRValue())))
4179 continue;
4180
4181 // Add explicit broadcast at the insert point that dominates all users.
4182 VPBasicBlock *HoistBlock = VectorPreheader;
4183 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4184 for (VPUser *User : VPV->users()) {
4185 if (User->usesScalars(VPV))
4186 continue;
4187 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4188 HoistPoint = HoistBlock->begin();
4189 else
4190 assert(VPDT.dominates(VectorPreheader,
4191 cast<VPRecipeBase>(User)->getParent()) &&
4192 "All users must be in the vector preheader or dominated by it");
4193 }
4194
4195 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4196 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4197 VPV->replaceUsesWithIf(Broadcast,
4198 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4199 return Broadcast != &U && !U.usesScalars(VPV);
4200 });
4201 }
4202}
4203
4204void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
4205  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4206
4207  // Collect candidate loads with invariant addresses and noalias scope
4208  // metadata, as well as memory-writing recipes with noalias metadata.
4212 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4213 for (VPRecipeBase &R : *VPBB) {
4214 // Only handle single-scalar replicated loads with invariant addresses.
4215 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4216 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4217 RepR->getOpcode() != Instruction::Load)
4218 continue;
4219
4220 VPValue *Addr = RepR->getOperand(0);
4221 if (Addr->isDefinedOutsideLoopRegions()) {
4223 if (!Loc.AATags.Scope)
4224 continue;
4225 CandidateLoads.push_back({RepR, Loc});
4226 }
4227 }
4228 if (R.mayWriteToMemory()) {
4230 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4231 return;
4232 Stores.push_back(*Loc);
4233 }
4234 }
4235 }
4236
4237 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4238 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4239 // Hoist the load to the preheader if it doesn't alias with any stores
4240 // according to the noalias metadata. Other loads should have been hoisted
4241 // by other passes
4242 const AAMDNodes &LoadAA = LoadLoc.AATags;
4243 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4245 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4246 })) {
4247 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4248 }
4249 }
4250}
4251
4252// Collect common metadata from a group of replicate recipes by intersecting
4253// metadata from all recipes in the group.
4254static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4255  VPIRMetadata CommonMetadata = *Recipes.front();
4256 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4257 CommonMetadata.intersect(*Recipe);
4258 return CommonMetadata;
4259}
4260
4261template <unsigned Opcode>
4264 const Loop *L) {
4265 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4266 "Only Load and Store opcodes supported");
4267 constexpr bool IsLoad = (Opcode == Instruction::Load);
4268 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4269 VPTypeAnalysis TypeInfo(Plan);
4270
4271 // Group predicated operations by their address SCEV.
4273 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4274 auto *VPBB = cast<VPBasicBlock>(Block);
4275 for (VPRecipeBase &R : *VPBB) {
4276 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4277 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4278 continue;
4279
4280 // For loads, operand 0 is address; for stores, operand 1 is address.
4281 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4282 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
4283 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4284 RecipesByAddress[AddrSCEV].push_back(RepR);
4285 }
4286 }
4287
4288 // For each address, collect operations with the same or complementary masks.
4290 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4291 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4292 };
4293 for (auto &[Addr, Recipes] : RecipesByAddress) {
4294 if (Recipes.size() < 2)
4295 continue;
4296
4297 // Collect groups with the same or complementary masks.
4298 for (VPReplicateRecipe *&RecipeI : Recipes) {
4299 if (!RecipeI)
4300 continue;
4301
4302 VPValue *MaskI = RecipeI->getMask();
4303 Type *TypeI = GetLoadStoreValueType(RecipeI);
4305 Group.push_back(RecipeI);
4306 RecipeI = nullptr;
4307
4308 // Find all operations with the same or complementary masks.
4309 bool HasComplementaryMask = false;
4310 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4311 if (!RecipeJ)
4312 continue;
4313
4314 VPValue *MaskJ = RecipeJ->getMask();
4315 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4316 if (TypeI == TypeJ) {
4317 // Check if any operation in the group has a complementary mask with
4318 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4319 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4320 match(MaskJ, m_Not(m_Specific(MaskI)));
4321 Group.push_back(RecipeJ);
4322 RecipeJ = nullptr;
4323 }
4324 }
4325
4326 if (HasComplementaryMask) {
4327 assert(Group.size() >= 2 && "must have at least 2 entries");
4328 AllGroups.push_back(std::move(Group));
4329 }
4330 }
4331 }
4332
4333 return AllGroups;
4334}
4335
4336// Find the recipe with minimum alignment in the group.
4337template <typename InstType>
4338static VPReplicateRecipe *
4340 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4341 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4342 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4343 });
4344}
4345
4346void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
4347                                           const Loop *L) {
4348 auto Groups =
4350 if (Groups.empty())
4351 return;
4352
4353 VPDominatorTree VPDT(Plan);
4354
4355 // Process each group of loads.
4356 for (auto &Group : Groups) {
4357 // Sort loads by dominance order, with earliest (most dominating) first.
4358 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4359 return VPDT.properlyDominates(A, B);
4360 });
4361
4362 // Try to use the earliest (most dominating) load to replace all others.
4363 VPReplicateRecipe *EarliestLoad = Group[0];
4364 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4365 VPBasicBlock *LastBB = Group.back()->getParent();
4366
4367 // Check that the load doesn't alias with stores between first and last.
4368 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4369 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4370 continue;
4371
4372 // Collect common metadata from all loads in the group.
4373 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4374
4375 // Find the load with minimum alignment to use.
4376 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4377
4378 // Create an unpredicated version of the earliest load with common
4379 // metadata.
4380 auto *UnpredicatedLoad = new VPReplicateRecipe(
4381 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4382 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4383 CommonMetadata);
4384
4385 UnpredicatedLoad->insertBefore(EarliestLoad);
4386
4387 // Replace all loads in the group with the unpredicated load.
4388 for (VPReplicateRecipe *Load : Group) {
4389 Load->replaceAllUsesWith(UnpredicatedLoad);
4390 Load->eraseFromParent();
4391 }
4392 }
4393}
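// Illustrative example for the hoisting above (assumed masks): two predicated
// loads of the same address %p, one under mask %m and one under not(%m), form
// a group with complementary masks; they are replaced by a single
// unpredicated load of %p at the earliest position, using the smaller of the
// two alignments and the intersection of their metadata, provided no
// potentially aliasing store sits between the first and last member.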
4394
4395static bool
4396canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4397                             ScalarEvolution &SE, const Loop &L,
4398 VPTypeAnalysis &TypeInfo) {
4399 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4400 if (!StoreLoc || !StoreLoc->AATags.Scope)
4401 return false;
4402
4403 // When sinking a group of stores, all members of the group alias each other.
4404 // Skip them during the alias checks.
4405 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4406 StoresToSink.end());
4407
4408 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4409 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4410 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], SE, L, TypeInfo);
4411 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4412}
4413
4414void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
4415                                           const Loop *L) {
4416 auto Groups =
4418 if (Groups.empty())
4419 return;
4420
4421 VPDominatorTree VPDT(Plan);
4422 VPTypeAnalysis TypeInfo(Plan);
4423
4424 for (auto &Group : Groups) {
4425 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4426 return VPDT.properlyDominates(A, B);
4427 });
4428
4429 if (!canSinkStoreWithNoAliasCheck(Group, SE, *L, TypeInfo))
4430 continue;
4431
4432 // Use the last (most dominated) store's location for the unconditional
4433 // store.
4434 VPReplicateRecipe *LastStore = Group.back();
4435 VPBasicBlock *InsertBB = LastStore->getParent();
4436
4437 // Collect common alias metadata from all stores in the group.
4438 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4439
4440 // Build select chain for stored values.
4441 VPValue *SelectedValue = Group[0]->getOperand(0);
4442 VPBuilder Builder(InsertBB, LastStore->getIterator());
4443
4444 for (unsigned I = 1; I < Group.size(); ++I) {
4445 VPValue *Mask = Group[I]->getMask();
4446 VPValue *Value = Group[I]->getOperand(0);
4447 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4448 Group[I]->getDebugLoc());
4449 }
4450
4451 // Find the store with minimum alignment to use.
4452 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4453
4454 // Create unconditional store with selected value and common metadata.
4455 auto *UnpredicatedStore =
4456 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4457 {SelectedValue, LastStore->getOperand(1)},
4458 /*IsSingleScalar=*/false,
4459 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4460 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4461
4462 // Remove all predicated stores from the group.
4463 for (VPReplicateRecipe *Store : Group)
4464 Store->eraseFromParent();
4465 }
4466}
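// Illustrative example for the sinking above (assumed masks): predicated
// stores "store %v1 to %p under %m" (earlier) and "store %v2 to %p under
// not(%m)" (later) are replaced at the last store's position by
//   %val = select not(%m), %v2, %v1   (later stores select over earlier ones)
//   store %val to %p                  (unpredicated)
// provided the no-alias scope checks above succeed.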
4467
4468void VPlanTransforms::materializeConstantVectorTripCount(
4469    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4470    PredicatedScalarEvolution &PSE) {
4471  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4472 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4473
4474 VPValue *TC = Plan.getTripCount();
4475 // Skip cases for which the trip count may be non-trivial to materialize.
4476  // I.e., when a scalar tail is absent (due to tail folding), or when a
4477  // scalar tail is required.
4478 if (!Plan.hasScalarTail() ||
4480 Plan.getScalarPreheader() ||
4481 !TC->isLiveIn())
4482 return;
4483
4484  // Materialize the vector trip count for constant trip counts early if it
4485  // can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
4486 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4487 // tail-folded loops.
4488 ScalarEvolution &SE = *PSE.getSE();
4489 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4490 if (!isa<SCEVConstant>(TCScev))
4491 return;
4492 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4493 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4494 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4495 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4496}
4497
4498void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4499                                                    VPBasicBlock *VectorPH) {
4500  VPValue *BTC = Plan.getBackedgeTakenCount();
4501  if (BTC->getNumUsers() == 0)
4502 return;
4503
4504 VPBuilder Builder(VectorPH, VectorPH->begin());
4505 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4506 auto *TCMO = Builder.createNaryOp(
4507 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4508 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4509 BTC->replaceAllUsesWith(TCMO);
4510}
4511
4512void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
4513  if (Plan.hasScalarVFOnly())
4514 return;
4515
4516 VPTypeAnalysis TypeInfo(Plan);
4517 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4518 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4520 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4521 vp_depth_first_shallow(LoopRegion->getEntry()));
4522 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4523 // VPInstructions, excluding ones in replicate regions. Those are not
4524 // materialized explicitly yet. Those vector users are still handled in
4525 // VPReplicateRegion::execute(), via shouldPack().
4526 // TODO: materialize build vectors for replicating recipes in replicating
4527 // regions.
4528 for (VPBasicBlock *VPBB :
4529 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4530 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4532 continue;
4533 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4534 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4535 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4536 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4537 };
4538 if ((isa<VPReplicateRecipe>(DefR) &&
4539 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4540 (isa<VPInstruction>(DefR) &&
4542 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4543 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4544 continue;
4545
4546 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4547 unsigned Opcode = ScalarTy->isStructTy()
4550 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4551 BuildVector->insertAfter(DefR);
4552
4553 DefR->replaceUsesWithIf(
4554 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4555 VPUser &U, unsigned) {
4556 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4557 });
4558 }
4559 }
4560
4561 // Create explicit VPInstructions to convert vectors to scalars. The current
4562 // implementation is conservative - it may miss some cases that may or may not
4563 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4564 // if they are known to operate on scalar values.
4565 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4566 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4569 continue;
4570 for (VPValue *Def : R.definedValues()) {
4571 // Skip recipes that are single-scalar or only have their first lane
4572 // used.
4573 // TODO: The Defs skipped here may or may not be vector values.
4574 // Introduce Unpacks, and remove them later, if they are guaranteed to
4575 // produce scalar values.
4577 continue;
4578
4579 // At the moment, we create unpacks only for scalar users outside
4580 // replicate regions. Recipes inside replicate regions still extract the
4581 // required lanes implicitly.
4582 // TODO: Remove once replicate regions are unrolled completely.
4583 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4584 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4585 return U->usesScalars(Def) &&
4586 (!ParentRegion || !ParentRegion->isReplicator());
4587 };
4588 if (none_of(Def->users(), IsCandidateUnpackUser))
4589 continue;
4590
4591 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4592 if (R.isPhi())
4593 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4594 else
4595 Unpack->insertAfter(&R);
4596 Def->replaceUsesWithIf(Unpack,
4597 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4598 return IsCandidateUnpackUser(&U);
4599 });
4600 }
4601 }
4602 }
4603}
4604
4605void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
4606                                                 VPBasicBlock *VectorPHVPBB,
4607 bool TailByMasking,
4608 bool RequiresScalarEpilogue) {
4609 VPValue &VectorTC = Plan.getVectorTripCount();
4610 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4611 // There's nothing to do if there are no users of the vector trip count or its
4612 // IR value has already been set.
4613 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4614 return;
4615
4616 VPValue *TC = Plan.getTripCount();
4617 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4618 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4619 VPValue *Step = &Plan.getVFxUF();
4620
4621 // If the tail is to be folded by masking, round the number of iterations N
4622 // up to a multiple of Step instead of rounding down. This is done by first
4623 // adding Step-1 and then rounding down. Note that it's ok if this addition
4624 // overflows: the vector induction variable will eventually wrap to zero given
4625 // that it starts at zero and its Step is a power of two; the loop will then
4626 // exit, with the last early-exit vector comparison also producing all-true.
4627 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4628 // is accounted for in emitIterationCountCheck that adds an overflow check.
4629 if (TailByMasking) {
4630 TC = Builder.createNaryOp(
4631 Instruction::Add,
4632 {TC, Builder.createNaryOp(Instruction::Sub,
4633 {Step, Plan.getConstantInt(TCTy, 1)})},
4634 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4635 }
4636
4637 // Now we need to generate the expression for the part of the loop that the
4638 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4639 // iterations are not required for correctness, or N - Step, otherwise. Step
4640 // is equal to the vectorization factor (number of SIMD elements) times the
4641 // unroll factor (number of SIMD instructions).
4642 VPValue *R =
4643 Builder.createNaryOp(Instruction::URem, {TC, Step},
4644 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4645
4646 // There are cases where we *must* run at least one iteration in the remainder
4647 // loop. See the cost model for when this can happen. If the step evenly
4648 // divides the trip count, we set the remainder to be equal to the step. If
4649 // the step does not evenly divide the trip count, no adjustment is necessary
4650 // since there will already be scalar iterations. Note that the minimum
4651 // iterations check ensures that N >= Step.
4652 if (RequiresScalarEpilogue) {
4653 assert(!TailByMasking &&
4654           "requiring scalar epilogue is not supported with tail folding");
4655 VPValue *IsZero =
4656 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4657 R = Builder.createSelect(IsZero, Step, R);
4658 }
4659
4660 VPValue *Res = Builder.createNaryOp(
4661 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4662 VectorTC.replaceAllUsesWith(Res);
4663}
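// Illustrative arithmetic for the computation above (assumed values): with
// TC = 17 and Step = VF x UF = 8, n.mod.vf = 17 % 8 = 1 and n.vec = 16, so the
// vector loop runs two iterations and one iteration is left for the scalar
// loop. If a scalar epilogue is required and TC = 16, the remainder 0 is
// bumped up to Step, giving n.vec = 8. With tail folding, TC is first rounded
// up to n.rnd.up = 17 + 7 = 24, giving n.vec = 24.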
4664
4665void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
4666                                            ElementCount VFEC) {
4667 VPBuilder Builder(VectorPH, VectorPH->begin());
4668 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4669 VPValue &VF = Plan.getVF();
4670 VPValue &VFxUF = Plan.getVFxUF();
4671 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4672 // used.
4673 // TODO: Assert that they aren't used.
4674
4675 // If there are no users of the runtime VF, compute VFxUF by constant folding
4676 // the multiplication of VF and UF.
4677 if (VF.getNumUsers() == 0) {
4678 VPValue *RuntimeVFxUF =
4679 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4680 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4681 return;
4682 }
4683
4684 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4685 // vscale) * UF.
4686 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4688 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4690 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4691 }
4692 VF.replaceAllUsesWith(RuntimeVF);
4693
4694 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4695 VPValue *MulByUF = Builder.createOverflowingOp(
4696 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4697 VFxUF.replaceAllUsesWith(MulByUF);
4698}
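// Illustrative example for the materialization above (assumed values): for
// VF = vscale x 4 and UF = 2 with no users of the runtime VF, VFxUF is emitted
// directly as the element count vscale x 8; otherwise the runtime VF is
// emitted as 4 * vscale (broadcast for any vector users) and VFxUF as
// (4 * vscale) * 2.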
4699
4700DenseMap<const SCEV *, Value *>
4701VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
4702  SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4703
4704 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4705 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4706 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4707 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4709 continue;
4710 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4711 if (!ExpSCEV)
4712 break;
4713 const SCEV *Expr = ExpSCEV->getSCEV();
4714 Value *Res =
4715 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4716 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4717 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4718 ExpSCEV->replaceAllUsesWith(Exp);
4719 if (Plan.getTripCount() == ExpSCEV)
4720 Plan.resetTripCount(Exp);
4721 ExpSCEV->eraseFromParent();
4722 }
4724 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4725 "after any VPIRInstructions");
4726 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4727 // to the VPIRBasicBlock.
4728 auto EI = Entry->begin();
4729 for (Instruction &I : drop_end(*EntryBB)) {
4730 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4731 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4732 EI++;
4733 continue;
4734 }
4736 }
4737
4738 return ExpandedSCEVs;
4739}
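// Illustrative walk-through (hypothetical expression): a VPExpandSCEVRecipe
// for a SCEV such as (4 * %n) is expanded to IR just before the entry block's
// terminator, the resulting Value replaces the recipe as a live-in (and as the
// trip count, if the recipe was the trip count), and the second loop above
// then wraps any entry-block IR not yet modeled, including the freshly
// expanded instructions, into the VPIRBasicBlock.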
4740
4741/// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can be
4742/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
4743/// store interleave group at index \p Idx; \p WideMember0 is the recipe feeding
4744/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
4745/// an index-independent load if it feeds all wide ops at all indices (\p OpV
4746/// must be the operand at index \p OpIdx of both the current recipe and \p
4747/// WideMember0, the recipe at lane 0). A VPInterleaveRecipe can be narrowed to
4748/// a wide load if \p V is defined at \p Idx of a load interleave group.
4749static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4750 VPValue *OpV, unsigned Idx) {
4751 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4752 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4753 if (!Member0OpR)
4754 return Member0Op == OpV;
4755 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4756 return !W->getMask() && Member0Op == OpV;
4757 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4758 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4759 return false;
4760}
4761
4762/// Returns true if \p InterleaveR is a full interleave group with factor and number of
4763/// members both equal to \p VF. The interleave group must also access the full
4764/// vector width \p VectorRegWidth.
4766 ElementCount VF,
4767 VPTypeAnalysis &TypeInfo,
4768 TypeSize VectorRegWidth) {
4769 if (!InterleaveR || InterleaveR->getMask())
4770 return false;
4771
4772 Type *GroupElementTy = nullptr;
4773 if (InterleaveR->getStoredValues().empty()) {
4774 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4775 if (!all_of(InterleaveR->definedValues(),
4776 [&TypeInfo, GroupElementTy](VPValue *Op) {
4777 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4778 }))
4779 return false;
4780 } else {
4781 GroupElementTy =
4782 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4783 if (!all_of(InterleaveR->getStoredValues(),
4784 [&TypeInfo, GroupElementTy](VPValue *Op) {
4785 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4786 }))
4787 return false;
4788 }
4789
4790 unsigned VFMin = VF.getKnownMinValue();
4791 TypeSize GroupSize = TypeSize::get(
4792 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4793 const auto *IG = InterleaveR->getInterleaveGroup();
4794 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4795 GroupSize == VectorRegWidth;
4796}
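// Illustrative example (assumed values): with VF = 4, i32 group members and a
// 128-bit vector register, a full group with factor 4 and 4 members has
// GroupSize = 4 * 32 = 128 bits == VectorRegWidth and qualifies; a factor-2
// group, or one mixing i32 and i64 members, does not.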
4797
4798/// Returns true if \p VPV is a narrow VPValue.
4799static bool isAlreadyNarrow(VPValue *VPV) {
4800 if (VPV->isLiveIn())
4801 return true;
4802 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4803 return RepR && RepR->isSingleScalar();
4804}
4805
4806// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4807// a narrow variant.
4808static VPValue *
4810 auto *R = V->getDefiningRecipe();
4811 if (!R || NarrowedOps.contains(V))
4812 return V;
4813
4814 if (isAlreadyNarrow(V))
4815 return V;
4816
4817 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4818 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4819 WideMember0->setOperand(
4820 Idx,
4821 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4822 return V;
4823 }
4824
4825 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4826 // Narrow interleave group to wide load, as transformed VPlan will only
4827 // process one original iteration.
4828 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4829 auto *L = new VPWidenLoadRecipe(
4830 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4831 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4832 L->insertBefore(LoadGroup);
4833 NarrowedOps.insert(L);
4834 return L;
4835 }
4836
4837 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4838 assert(RepR->isSingleScalar() &&
4839 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4840 "must be a single scalar load");
4841 NarrowedOps.insert(RepR);
4842 return RepR;
4843 }
4844
4845 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4846 VPValue *PtrOp = WideLoad->getAddr();
4847 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4848 PtrOp = VecPtr->getOperand(0);
4849 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4850 // process one original iteration.
4851 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4852 /*IsUniform*/ true,
4853 /*Mask*/ nullptr, {}, *WideLoad);
4854 N->insertBefore(WideLoad);
4855 NarrowedOps.insert(N);
4856 return N;
4857}
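// Illustrative summary of the cases above (assumed shapes): a load interleave
// group of factor VF is rewritten to one consecutive wide load, and a plain
// wide load to a uniform single-scalar replicate load, because, as noted in
// the comments above, the transformed VPlan only processes one original
// iteration.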
4858
4860 TypeSize VectorRegWidth) {
4861 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4862 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4863 return;
4864
4865 VPTypeAnalysis TypeInfo(Plan);
4866
4868 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4870 continue;
4871
4874 continue;
4875
4876 // Bail out on recipes not supported at the moment:
4877 // * phi recipes other than the canonical induction
4878 // * recipes writing to memory except interleave groups
4879 // Only support plans with a canonical induction phi.
4880 if (R.isPhi())
4881 return;
4882
4883 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4884 if (R.mayWriteToMemory() && !InterleaveR)
4885 return;
4886
4887 // Do not narrow interleave groups if there are VectorPointer recipes and
4888 // the plan was unrolled. The recipe implicitly uses VF from
4889 // VPTransformState.
4890 // TODO: Remove restriction once the VF for the VectorPointer offset is
4891 // modeled explicitly as operand.
4892 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4893 return;
4894
4895 // All other ops are allowed, but we reject uses that cannot be converted
4896 // when checking all allowed consumers (store interleave groups) below.
4897 if (!InterleaveR)
4898 continue;
4899
4900 // Bail out on non-consecutive interleave groups.
4901 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4902 VectorRegWidth))
4903 return;
4904
4905 // Skip read interleave groups.
4906 if (InterleaveR->getStoredValues().empty())
4907 continue;
4908
4909 // Narrow interleave groups, if all operands are already matching narrow
4910 // ops.
4911 auto *Member0 = InterleaveR->getStoredValues()[0];
4912 if (isAlreadyNarrow(Member0) &&
4913 all_of(InterleaveR->getStoredValues(),
4914 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4915 StoreGroups.push_back(InterleaveR);
4916 continue;
4917 }
4918
4919 // For now, we only support full interleave groups storing load interleave
4920 // groups.
4921 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4922 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4923 if (!DefR)
4924 return false;
4925 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4926 return IR && IR->getInterleaveGroup()->isFull() &&
4927 IR->getVPValue(Op.index()) == Op.value();
4928 })) {
4929 StoreGroups.push_back(InterleaveR);
4930 continue;
4931 }
4932
4933 // Check if all values feeding InterleaveR are matching wide recipes whose
4934 // operands can be narrowed.
4935 auto *WideMember0 =
4936 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4937 if (!WideMember0)
4938 return;
4939 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4941 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4942 R->getNumOperands() > 2)
4943 return;
4944 if (any_of(enumerate(R->operands()),
4945 [WideMember0, Idx = I](const auto &P) {
4946 const auto &[OpIdx, OpV] = P;
4947 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4948 }))
4949 return;
4950 }
4951 StoreGroups.push_back(InterleaveR);
4952 }
4953
4954 if (StoreGroups.empty())
4955 return;
4956
4957 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4958 SmallPtrSet<VPValue *, 4> NarrowedOps;
4959 // Narrow operation tree rooted at store groups.
4960 for (auto *StoreGroup : StoreGroups) {
4961 VPValue *Res =
4962 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4963 auto *SI =
4964 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4965 auto *S = new VPWidenStoreRecipe(
4966 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4967 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4968 S->insertBefore(StoreGroup);
4969 StoreGroup->eraseFromParent();
4970 }
4971
4972 // Adjust induction to reflect that the transformed plan only processes one
4973 // original iteration.
4974 auto *CanIV = VectorLoop->getCanonicalIV();
4975 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4976 VPBuilder PHBuilder(Plan.getVectorPreheader());
4977
4978 VPValue *UF = Plan.getOrAddLiveIn(
4979 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4980 if (VF.isScalable()) {
4981 VPValue *VScale = PHBuilder.createElementCount(
4983 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
4984 Instruction::Mul, {VScale, UF}, {true, false});
4985 Inc->setOperand(1, VScaleUF);
4986 Plan.getVF().replaceAllUsesWith(VScale);
4987 } else {
4988 Inc->setOperand(1, UF);
4990 Plan.getConstantInt(CanIV->getScalarType(), 1));
4991 }
4992 removeDeadRecipes(Plan);
4993}
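// Illustrative example (assumed values): after narrowing with a fixed VF = 4
// and UF = 2, the canonical IV increment is set to 2 and users of the runtime
// VF see 1; for a scalable VF the increment becomes 2 * vscale and the runtime
// VF becomes vscale, so the narrowed loop advances by one original iteration
// per part (times vscale for scalable VFs).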
4994
4995/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4996/// BranchOnCond recipe.
4998 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4999 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5000 auto *MiddleTerm =
5002 // Only add branch metadata if there is a (conditional) terminator.
5003 if (!MiddleTerm)
5004 return;
5005
5006 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5007 "must have a BranchOnCond");
5008 // Assume that `TripCount % VectorStep` is equally distributed.
5009 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5010 if (VF.isScalable() && VScaleForTuning.has_value())
5011 VectorStep *= *VScaleForTuning;
5012 assert(VectorStep > 0 && "trip count should not be zero");
5013 MDBuilder MDB(Plan.getContext());
5014 MDNode *BranchWeights =
5015 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5016 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5017}
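// Illustrative example (assumed values): for VF = 4 and UF = 2, VectorStep = 8
// (further scaled by VScaleForTuning for scalable VFs), so the middle-block
// terminator gets branch weights {1, 7}: under the uniform-remainder
// assumption above, only about 1 in 8 trip counts divides evenly by the
// vector step.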
5018
5019/// Compute and return the end value for \p WideIV, unless it is truncated. If
5020/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5021/// compute the end value of the induction.
5023 VPBuilder &VectorPHBuilder,
5024 VPTypeAnalysis &TypeInfo,
5025 VPValue *VectorTC) {
5026 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5027 // Truncated wide inductions resume from the last lane of their vector value
5028 // in the last vector iteration which is handled elsewhere.
5029 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5030 return nullptr;
5031
5032 VPValue *Start = WideIV->getStartValue();
5033 VPValue *Step = WideIV->getStepValue();
5035 VPValue *EndValue = VectorTC;
5036 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5037 EndValue = VectorPHBuilder.createDerivedIV(
5038 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5039 Start, VectorTC, Step);
5040 }
5041
5042 // EndValue is derived from the vector trip count (which has the same type as
5043 // the widest induction) and thus may be wider than the induction here.
5044 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5045 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5046 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5047 ScalarTypeOfWideIV,
5048 WideIV->getDebugLoc());
5049 }
5050
5051 return EndValue;
5052}
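// Illustrative example (assumed values): for a non-canonical integer induction
// with Start = 3, Step = 2 and a vector trip count of 16, the derived end
// value is Start + VectorTC * Step = 3 + 16 * 2 = 35, truncated if the
// induction is narrower than the trip count type; a canonical induction simply
// resumes at VectorTC.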
5053
5055 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5056 VPTypeAnalysis TypeInfo(Plan);
5057 auto *ScalarPH = Plan.getScalarPreheader();
5058 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5059 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5060 VPBuilder VectorPHBuilder(
5061 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5062 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5063 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5064 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5065
5066 // TODO: Extract final value from induction recipe initially, optimize to
5067 // pre-computed end value together in optimizeInductionExitUsers.
5068 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5069 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5071 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5072 IVEndValues[WideIVR] = EndValue;
5073 ResumePhiR->setOperand(0, EndValue);
5074 ResumePhiR->setName("bc.resume.val");
5075 continue;
5076 }
5077 // TODO: Also handle truncated inductions here. Computing end-values
5078 // separately should be done as VPlan-to-VPlan optimization, after
5079 // legalizing all resume values to use the last lane from the loop.
5080 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5081 "should only skip truncated wide inductions");
5082 continue;
5083 }
5084
5085 // The backedge value provides the value to resume coming out of a loop,
5086 // which for FORs is a vector whose last element needs to be extracted. The
5087 // start value provides the value if the loop is bypassed.
5088 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5089 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5090 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5091 "Cannot handle loops with uncountable early exits");
5092 if (IsFOR) {
5093 auto *ExtractPart = MiddleBuilder.createNaryOp(
5094 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5095 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5097 "vector.recur.extract");
5098 }
5099 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5100 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5101 }
5102}
5103
5105 VFRange &Range) {
5106 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5107 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5108 auto *MiddleVPBB = Plan.getMiddleBlock();
5109 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5110 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5111
5112 auto IsScalableOne = [](ElementCount VF) -> bool {
5113 return VF == ElementCount::getScalable(1);
5114 };
5115
5116 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5117 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5118 if (!FOR)
5119 continue;
5120
5121 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5122 "Cannot handle loops with uncountable early exits");
5123
5124 // This is the second phase of vectorizing first-order recurrences, creating
5125 // extracts for users outside the loop. An overview of the transformation is
5126 // described below. Suppose we have the following loop with some use after
5127 // the loop of the last a[i-1],
5128 //
5129 // for (int i = 0; i < n; ++i) {
5130 // t = a[i - 1];
5131 // b[i] = a[i] - t;
5132 // }
5133 // use t;
5134 //
5135 // There is a first-order recurrence on "a". For this loop, the shorthand
5136 // scalar IR looks like:
5137 //
5138 // scalar.ph:
5139 // s.init = a[-1]
5140 // br scalar.body
5141 //
5142 // scalar.body:
5143 // i = phi [0, scalar.ph], [i+1, scalar.body]
5144 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5145 // s2 = a[i]
5146 // b[i] = s2 - s1
5147 // br cond, scalar.body, exit.block
5148 //
5149 // exit.block:
5150 // use = lcssa.phi [s1, scalar.body]
5151 //
5152 // In this example, s1 is a recurrence because its value depends on the
5153 // previous iteration. In the first phase of vectorization, we created a
5154 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5155 // for users in the scalar preheader and exit block.
5156 //
5157 // vector.ph:
5158 // v_init = vector(..., ..., ..., a[-1])
5159 // br vector.body
5160 //
5161 // vector.body
5162 // i = phi [0, vector.ph], [i+4, vector.body]
5163 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5164 // v2 = a[i, i+1, i+2, i+3]
5165 // b[i] = v2 - v1
5166 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5167 // b[i, i+1, i+2, i+3] = v2 - v1
5168 // br cond, vector.body, middle.block
5169 //
5170 // middle.block:
5171 // vector.recur.extract.for.phi = v2(2)
5172 // vector.recur.extract = v2(3)
5173 // br cond, scalar.ph, exit.block
5174 //
5175 // scalar.ph:
5176 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5177 // [s.init, otherwise]
5178 // br scalar.body
5179 //
5180 // scalar.body:
5181 // i = phi [0, scalar.ph], [i+1, scalar.body]
5182 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5183 // s2 = a[i]
5184 // b[i] = s2 - s1
5185 // br cond, scalar.body, exit.block
5186 //
5187 // exit.block:
5188 // lo = lcssa.phi [s1, scalar.body],
5189 // [vector.recur.extract.for.phi, middle.block]
5190 //
5191 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5192 // Extract the penultimate value of the recurrence and use it as operand for
5193 // the VPIRInstruction modeling the phi.
5195 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5197 continue;
5198
5199 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5200 // penultimate value of the recurrence. Instead we rely on the existing
5201 // extract of the last element from the result of
5202 // VPInstruction::FirstOrderRecurrenceSplice.
5203 // TODO: Consider vscale_range info and UF.
5205 Range))
5206 return;
5207 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5208 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5209 "vector.recur.extract.for.phi");
5210 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5211 }
5212 }
5213}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, ScalarEvolution &SE, const Loop &L, VPTypeAnalysis &TypeInfo)
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, ScalarEvolution &SE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
APInt abs() const
Get the absolute value.
Definition APInt.h:1796
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:136
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3617
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3978
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4053
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4005
iterator end()
Definition VPlan.h:4015
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4013
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4066
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4027
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2526
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2560
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2550
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2566
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2546
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:151
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:189
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3030
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3561
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:426
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:421
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:399
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:411
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3728
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3649
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3075
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2052
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2095
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2084
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4131
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4155
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1036
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1127
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1071
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1066
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1063
@ CanonicalIVIncrementForPart
Definition VPlan.h:1056
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2668
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2660
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2689
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2742
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2700
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3217
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4283
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2904
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2793
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4166
const VPBlockBase * getEntry() const
Definition VPlan.h:4202
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4277
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4234
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4219
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4264
const VPBlockBase * getExiting() const
Definition VPlan.h:4214
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4227
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2949
bool isSingleScalar() const
Definition VPlan.h:2990
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3014
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3798
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:202
operand_range operands()
Definition VPlanValue.h:270
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:246
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:241
void addOperand(VPValue *Operand)
Definition VPlanValue.h:235
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:181
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:83
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:191
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:111
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:176
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:132
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1911
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3691
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1552
Instruction::CastOps getOpcode() const
Definition VPlan.h:1588
A recipe for handling GEP instructions.
Definition VPlan.h:1848
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2119
PHINode * getPHINode() const
Definition VPlan.h:2161
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2147
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2164
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2195
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2271
A recipe for widening vector intrinsics.
Definition VPlan.h:1602
A common base class for widening memory operations.
Definition VPlan.h:3260
A recipe for widened phis.
Definition VPlan.h:2329
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1512
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4296
bool hasVF(ElementCount VF) const
Definition VPlan.h:4497
LLVMContext & getContext() const
Definition VPlan.h:4485
VPBasicBlock * getEntry()
Definition VPlan.h:4385
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4476
bool hasScalableVF() const
Definition VPlan.h:4498
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4483
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4479
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4447
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4553
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4468
unsigned getUF() const
Definition VPlan.h:4517
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4623
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4578
bool hasUF(unsigned UF) const
Definition VPlan.h:4515
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4437
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4559
void setVF(ElementCount VF)
Definition VPlan.h:4491
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4530
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4461
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4410
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4601
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4556
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4539
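A hedged fragment, assuming Plan is a VPlan and StartV is an llvm::Value defined outside the loop being vectorized; repeated calls with the same IR value return the same live-in VPValue:
VPValue *StartVPV = Plan.getOrAddLiveIn(StartV); // created on first use, reused afterwards
VPValue *Same = Plan.getLiveIn(StartV);          // same VPValue, or nullptr if never added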
bool hasScalarVFOnly() const
Definition VPlan.h:4508
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4428
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4433
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4575
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4390
void setUF(unsigned UF)
Definition VPlan.h:4522
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4655
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
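The members above belong to FixedOrScalableQuantity, the base of ElementCount and TypeSize. A small self-contained sketch of how the fixed and scalable cases behave (the function name is illustrative only):
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void elementCountSketch() {
  ElementCount Fixed = ElementCount::getFixed(8);    // exactly 8 lanes
  ElementCount Scal = ElementCount::getScalable(2);  // vscale x 2 lanes
  (void)Fixed.isFixed();               // true
  (void)Fixed.getFixedValue();         // 8; only valid when isFixed()
  (void)Scal.isScalable();             // true
  (void)Scal.getKnownMinValue();       // 2, the compile-time minimum
  (void)Scal.multiplyCoefficientBy(4); // vscale x 8
}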
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsigned-divided by B, rounded according to the given rounding mode.
Definition APInt.cpp:2763
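A self-contained sketch of a ceiling division with APIntOps::RoundingUDiv (the function name is illustrative only):
#include "llvm/ADT/APInt.h"
using namespace llvm;

void ceilDivSketch() {
  APInt A(32, 10), B(32, 4);
  // Rounding::UP gives the ceiling: 10 / 4 rounded up == 3.
  APInt Ceil = APIntOps::RoundingUDiv(A, B, APInt::Rounding::UP);
  (void)Ceil;
}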
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
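A self-contained sketch of IR pattern matching with match(); isMulByOne is an illustrative helper, and m_Value is the standard operand-capturing matcher from llvm/IR/PatternMatch.h:
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if V computes "X * 1" and binds X to the multiplicand.
bool isMulByOne(Value *V, Value *&X) {
  return match(V, m_Mul(m_Value(X), m_One()));
}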
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
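A hedged fragment showing how these VPlan matchers are typically combined inside this file (where VPlanPatternMatch is already in scope); Term is assumed to be a block's terminating recipe, e.g. from VPBasicBlock::getTerminator(), and the zero- and three-operand matcher forms follow the declarations listed above:
if (Term && match(Term, m_BranchOnCond())) {
  VPValue *Cond = Term->getOperand(0);
  if (match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue(), m_VPValue()))) {
    // The branch is controlled by an active-lane-mask; handle accordingly.
  }
}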
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2032
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
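A self-contained sketch of the range-based wrapper (allPositive is an illustrative helper):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

bool allPositive(const SmallVectorImpl<int> &Xs) {
  return all_of(Xs, [](int X) { return X > 0; }); // no explicit begin()/end()
}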
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
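A hedged sketch mirroring the check made in this file: a call can only be widened to a vector intrinsic if this helper returns something other than Intrinsic::not_intrinsic (hasVectorIntrinsic is an illustrative name):
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

bool hasVectorIntrinsic(const CallInst *CI, const TargetLibraryInfo &TLI) {
  return getVectorIntrinsicIDForCall(CI, &TLI) != Intrinsic::not_intrinsic;
}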
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2494
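A self-contained sketch pairing each element with its index (printIndexed is illustrative):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void printIndexed(const SmallVectorImpl<int> &Xs) {
  for (auto [Idx, Val] : enumerate(Xs))
    errs() << Idx << " -> " << Val << "\n";
}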
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
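A self-contained sketch of the erase-while-iterating idiom this adaptor enables; the iterator is advanced before the element is handed out, so the current element may be deleted safely:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

void eraseTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && !I.isTerminator() && !I.mayHaveSideEffects())
      I.eraseFromParent(); // safe: the loop iterator already moved on
}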
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
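A hedged fragment (the VPlan headers are already included in this file) contrasting the two traversals; the shallow form treats each VPRegionBlock as a single node, while the deep form also descends into the blocks nested inside regions:
for (VPBlockBase *VPB : vp_depth_first_shallow(Plan.getEntry()))
  if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB))
    (void)VPBB; // top-level basic blocks only

for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry()))
  if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB))
    (void)VPBB; // also visits blocks inside loop regions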
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
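A tiny self-contained sketch (powerOf2CeilSketch is illustrative):
#include "llvm/Support/MathExtras.h"
using namespace llvm;

void powerOf2CeilSketch() {
  (void)PowerOf2Ceil(5); // 8
  (void)PowerOf2Ceil(8); // 8, already a power of two
}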
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1966
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2042
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
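A self-contained sketch combining this helper with getLoadStorePointerOperand documented above (describeAccess is illustrative; getLoadStoreType requires I to actually be a load or store):
#include "llvm/IR/Instructions.h"
using namespace llvm;

void describeAccess(Instruction *I, Type *&AccessTy, const Value *&Ptr) {
  AccessTy = getLoadStoreType(I);      // the loaded or stored type
  Ptr = getLoadStorePointerOperand(I); // the address operand (null if not a load/store)
}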
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2120
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
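A self-contained sketch of combining heterogeneous fields into one hash_code, e.g. as a CSE-style key of opcode plus operand pointers (hashKey is illustrative):
#include "llvm/ADT/Hashing.h"
using namespace llvm;

hash_code hashKey(unsigned Opcode, const void *Op0, const void *Op1) {
  return hash_combine(Opcode, Op0, Op1);
}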
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2100
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
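A hedged fragment assuming Ctx is an LLVMContext in scope and toVectorTy is visible via the vector-utility headers:
Type *I32 = Type::getInt32Ty(Ctx);
Type *V4I32 = toVectorTy(I32, ElementCount::getFixed(4));      // <4 x i32>
Type *NxV2I32 = toVectorTy(I32, ElementCount::getScalable(2)); // <vscale x 2 x i32>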
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analyses needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2371
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3392
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3351
A recipe for widening select instructions.
Definition VPlan.h:1801
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3476
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3433
static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...