1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50 VPlan &Plan,
51 function_ref<const InductionDescriptor *(PHINode *)>
52 GetIntOrFpInductionDescriptor,
53 const TargetLibraryInfo &TLI) {
54
55 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
56 Plan.getVectorLoopRegion());
57 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
58 // Skip blocks outside region
59 if (!VPBB->getParent())
60 break;
61 VPRecipeBase *Term = VPBB->getTerminator();
62 auto EndIter = Term ? Term->getIterator() : VPBB->end();
63 // Introduce each ingredient into VPlan.
64 for (VPRecipeBase &Ingredient :
65 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
66
67 VPValue *VPV = Ingredient.getVPSingleValue();
68 if (!VPV->getUnderlyingValue())
69 continue;
70
71 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
72
73 VPRecipeBase *NewRecipe = nullptr;
74 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
75 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
76 const auto *II = GetIntOrFpInductionDescriptor(Phi);
77 if (!II) {
78 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
79 for (VPValue *Op : PhiR->operands())
80 NewRecipe->addOperand(Op);
81 } else {
82 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
83 VPValue *Step =
84 vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
85 // It is always safe to copy over the NoWrap and FastMath flags. In
86 // particular, when folding tail by masking, the masked-off lanes are
87 // never used, so it is safe.
89 NewRecipe = new VPWidenIntOrFpInductionRecipe(
90 Phi, Start, Step, &Plan.getVF(), *II, Flags,
91 Ingredient.getDebugLoc());
92 }
93 } else {
94 auto *VPI = cast<VPInstruction>(&Ingredient);
95 assert(!isa<PHINode>(Inst) && "phis should be handled above");
96 // Create VPWidenMemoryRecipe for loads and stores.
97 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
98 NewRecipe = new VPWidenLoadRecipe(
99 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
100 false /*Consecutive*/, false /*Reverse*/, *VPI,
101 Ingredient.getDebugLoc());
102 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
103 NewRecipe = new VPWidenStoreRecipe(
104 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
105 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
106 Ingredient.getDebugLoc());
107 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
108 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
109 Ingredient.getDebugLoc());
110 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
111 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
112 if (VectorID == Intrinsic::not_intrinsic)
113 return false;
114 NewRecipe = new VPWidenIntrinsicRecipe(
115 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
116 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
117 *VPI, CI->getDebugLoc());
118 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
119 NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
120 *VPI, Ingredient.getDebugLoc());
121 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
122 NewRecipe = new VPWidenCastRecipe(
123 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
124 VPIRFlags(*CI), VPIRMetadata(*CI));
125 } else {
126 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
127 *VPI, Ingredient.getDebugLoc());
128 }
129 }
130
131 NewRecipe->insertBefore(&Ingredient);
132 if (NewRecipe->getNumDefinedValues() == 1)
133 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
134 else
135 assert(NewRecipe->getNumDefinedValues() == 0 &&
136 "Only recpies with zero or one defined values expected");
137 Ingredient.eraseFromParent();
138 }
139 }
140 return true;
141}
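// For illustration, a rough sketch of the conversion above (schematic; the
// recipe spellings are simplified and not the exact textual VPlan format).
// A plain VPInstruction load such as
//
//   EMIT %l = load %addr
//
// is replaced, once the underlying IR LoadInst is known, by a widened memory
// recipe along the lines of
//
//   WIDEN %l = load %addr
//
// and a phi with a matching induction descriptor similarly becomes a
// WIDEN-INDUCTION recipe fed by the start value and the expanded step.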
142
143/// Helper for extra no-alias checks via known-safe recipe and SCEV.
144 class SinkStoreInfo {
145 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
146 VPReplicateRecipe &GroupLeader;
147 ScalarEvolution &SE;
148 const Loop &L;
149 VPTypeAnalysis &TypeInfo;
150
151 // Return true if \p A and \p B are known to not alias for all VFs in the
152 // plan, checked via the distance between the accesses
153 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
154 if (A->getOpcode() != Instruction::Store ||
155 B->getOpcode() != Instruction::Store)
156 return false;
157
158 VPValue *AddrA = A->getOperand(1);
159 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, SE, &L);
160 VPValue *AddrB = B->getOperand(1);
161 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, SE, &L);
162 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
163 return false;
164
165 const APInt *Distance;
166 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
167 return false;
168
169 const DataLayout &DL = SE.getDataLayout();
170 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
171 uint64_t SizeA = DL.getTypeStoreSize(TyA);
172 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
173 uint64_t SizeB = DL.getTypeStoreSize(TyB);
174
175 // Use the maximum store size to ensure no overlap from either direction.
176 // Currently only handles fixed sizes, as it is only used for
177 // replicating VPReplicateRecipes.
178 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
179
180 auto VFs = B->getParent()->getPlan()->vectorFactors();
182 return Distance->abs().uge(
183 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
184 }
185
186public:
187 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
188 VPReplicateRecipe &GroupLeader, ScalarEvolution &SE,
189 const Loop &L, VPTypeAnalysis &TypeInfo)
190 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), SE(SE), L(L),
191 TypeInfo(TypeInfo) {}
192
193 /// Return true if \p R should be skipped during alias checking, either
194 /// because it's in the exclude set or because no-alias can be proven via
195 /// SCEV.
196 bool shouldSkip(VPRecipeBase &R) const {
197 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
198 return ExcludeRecipes.contains(&R) ||
199 (Store && isNoAliasViaDistance(Store, &GroupLeader));
200 }
201};
202
203/// Check if a memory operation doesn't alias with memory operations in blocks
204/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
205/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
206/// checked (for load hoisting). Otherwise recipes that both read and write
207/// memory are checked, and SCEV is used to prove no-alias between the group
208/// leader and other replicate recipes (for store sinking).
209static bool
211 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
212 std::optional<SinkStoreInfo> SinkInfo = {}) {
213 bool CheckReads = SinkInfo.has_value();
214 if (!MemLoc.AATags.Scope)
215 return false;
216
217 const AAMDNodes &MemAA = MemLoc.AATags;
218
219 for (VPBlockBase *Block = FirstBB; Block;
220 Block = Block->getSingleSuccessor()) {
221 assert(Block->getNumSuccessors() <= 1 &&
222 "Expected at most one successor in block chain");
223 auto *VPBB = cast<VPBasicBlock>(Block);
224 for (VPRecipeBase &R : *VPBB) {
225 if (SinkInfo && SinkInfo->shouldSkip(R))
226 continue;
227
228 // Skip recipes that don't need checking.
229 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
230 continue;
231
233 if (!Loc)
234 // Conservatively assume aliasing for memory operations without
235 // location.
236 return false;
237
238 // For reads, check if they don't alias in the reverse direction and
239 // skip if so.
240 if (CheckReads && R.mayReadFromMemory() &&
242 MemAA.NoAlias))
243 continue;
244
245 // Check if the memory operations may alias in the forward direction.
247 Loc->AATags.NoAlias))
248 return false;
249 }
250
251 if (Block == LastBB)
252 break;
253 }
254 return true;
255}
256
257/// Return true if we do not know how to (mechanically) hoist or sink \p R out
258/// of a loop region.
259 static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
260 // Assumes don't alias anything or throw; as long as they're guaranteed to
261 // execute, they're safe to hoist.
262 if (match(&R, m_Intrinsic<Intrinsic::assume>()))
263 return false;
264
265 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
266 // memory location is not modified in the vector loop.
267 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
268 return true;
269
270 // Allocas cannot be hoisted.
271 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
272 return RepR && RepR->getOpcode() == Instruction::Alloca;
273}
274
275static bool sinkScalarOperands(VPlan &Plan) {
276 auto Iter = vp_depth_first_deep(Plan.getEntry());
277 bool ScalarVFOnly = Plan.hasScalarVFOnly();
278 bool Changed = false;
279
280 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
281 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
282 VPBasicBlock *SinkTo, VPValue *Op) {
283 auto *Candidate =
284 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
285 if (!Candidate)
286 return;
287
288 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
289 // for now.
290 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
291 return;
292
293 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
294 return;
295
296 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
297 if (!ScalarVFOnly && RepR->isSingleScalar())
298 return;
299
300 WorkList.insert({SinkTo, Candidate});
301 };
302
303 // First, collect the operands of all recipes in replicate blocks as seeds for
304 // sinking.
305 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
306 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
307 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
308 continue;
309 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
310 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
311 continue;
312 for (auto &Recipe : *VPBB)
313 for (VPValue *Op : Recipe.operands())
314 InsertIfValidSinkCandidate(VPBB, Op);
315 }
316
317 // Try to sink each replicate or scalar IV steps recipe in the worklist.
318 for (unsigned I = 0; I != WorkList.size(); ++I) {
319 VPBasicBlock *SinkTo;
320 VPSingleDefRecipe *SinkCandidate;
321 std::tie(SinkTo, SinkCandidate) = WorkList[I];
322
323 // All recipe users of SinkCandidate must be in the same block SinkTo or all
324 // users outside of SinkTo must only use the first lane of SinkCandidate. In
325 // the latter case, we need to duplicate SinkCandidate.
326 auto UsersOutsideSinkTo =
327 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
328 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
329 });
330 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
331 return !U->usesFirstLaneOnly(SinkCandidate);
332 }))
333 continue;
334 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
335
336 if (NeedsDuplicating) {
337 if (ScalarVFOnly)
338 continue;
339 VPSingleDefRecipe *Clone;
340 if (auto *SinkCandidateRepR =
341 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
342 // TODO: Handle converting to uniform recipes as separate transform,
343 // then cloning should be sufficient here.
344 Instruction *I = SinkCandidate->getUnderlyingInstr();
345 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
346 nullptr /*Mask*/, *SinkCandidateRepR,
347 *SinkCandidateRepR);
348 // TODO: add ".cloned" suffix to name of Clone's VPValue.
349 } else {
350 Clone = SinkCandidate->clone();
351 }
352
353 Clone->insertBefore(SinkCandidate);
354 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
355 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
356 });
357 }
358 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
359 for (VPValue *Op : SinkCandidate->operands())
360 InsertIfValidSinkCandidate(SinkTo, Op);
361 Changed = true;
362 }
363 return Changed;
364}
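// For illustration, a hedged sketch of the sinking above, with simplified
// names. Given a replicate region whose "then" block uses %idx, defined
// outside the region:
//
//   vector.body:
//     REPLICATE %idx = add %i, 1            ; only used in pred.store.if
//   pred.store.if:
//     REPLICATE store %v, gep %base, %idx
//
// sinkScalarOperands moves (or clones, when outside users only need the
// first lane) the %idx recipe into pred.store.if, so it only executes when
// the predicate is true.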
365
366/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
367/// the mask.
368 static VPValue *getPredicatedMask(VPRegionBlock *R) {
369 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
370 if (!EntryBB || EntryBB->size() != 1 ||
371 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
372 return nullptr;
373
374 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
375}
376
377/// If \p R is a triangle region, return the 'then' block of the triangle.
378 static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
379 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
380 if (EntryBB->getNumSuccessors() != 2)
381 return nullptr;
382
383 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
384 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
385 if (!Succ0 || !Succ1)
386 return nullptr;
387
388 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
389 return nullptr;
390 if (Succ0->getSingleSuccessor() == Succ1)
391 return Succ0;
392 if (Succ1->getSingleSuccessor() == Succ0)
393 return Succ1;
394 return nullptr;
395}
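// For illustration, the "triangle" shape the two helpers above look for
// (schematic, simplified block names):
//
//        entry  (BranchOnMask %mask)
//        /   \
//    then     |
//        \   /
//        merge
//
// getPredicatedMask returns %mask; getPredicatedThenBlock returns the
// single-successor block on the conditional side ("then").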
396
397// Merge replicate regions in their successor region, if a replicate region
398// is connected to a successor replicate region with the same predicate by a
399// single, empty VPBasicBlock.
400 static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
401 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
402
403 // Collect replicate regions followed by an empty block, followed by another
404 // replicate region with matching masks to process front. This is to avoid
405 // iterator invalidation issues while merging regions.
406 SmallVector<VPRegionBlock *, 8> WorkList;
407 for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
408 vp_depth_first_deep(Plan.getEntry()))) {
409 if (!Region1->isReplicator())
410 continue;
411 auto *MiddleBasicBlock =
412 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
413 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
414 continue;
415
416 auto *Region2 =
417 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
418 if (!Region2 || !Region2->isReplicator())
419 continue;
420
421 VPValue *Mask1 = getPredicatedMask(Region1);
422 VPValue *Mask2 = getPredicatedMask(Region2);
423 if (!Mask1 || Mask1 != Mask2)
424 continue;
425
426 assert(Mask1 && Mask2 && "both regions must have conditions");
427 WorkList.push_back(Region1);
428 }
429
430 // Move recipes from Region1 to its successor region, if both are triangles.
431 for (VPRegionBlock *Region1 : WorkList) {
432 if (TransformedRegions.contains(Region1))
433 continue;
434 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
435 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
436
437 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
438 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
439 if (!Then1 || !Then2)
440 continue;
441
442 // Note: No fusion-preventing memory dependencies are expected in either
443 // region. Such dependencies should be rejected during earlier dependence
444 // checks, which guarantee accesses can be re-ordered for vectorization.
445 //
446 // Move recipes to the successor region.
447 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
448 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
449
450 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
451 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
452
453 // Move VPPredInstPHIRecipes from the merge block to the successor region's
454 // merge block. Update all users inside the successor region to use the
455 // original values.
456 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
457 VPValue *PredInst1 =
458 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
459 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
460 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
461 return cast<VPRecipeBase>(&U)->getParent() == Then2;
462 });
463
464 // Remove phi recipes that are unused after merging the regions.
465 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
466 Phi1ToMove.eraseFromParent();
467 continue;
468 }
469 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
470 }
471
472 // Remove the dead recipes in Region1's entry block.
473 for (VPRecipeBase &R :
474 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
475 R.eraseFromParent();
476
477 // Finally, remove the first region.
478 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
479 VPBlockUtils::disconnectBlocks(Pred, Region1);
480 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
481 }
482 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
483 TransformedRegions.insert(Region1);
484 }
485
486 return !TransformedRegions.empty();
487}
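// For illustration, a rough before/after sketch of the merge above, with
// simplified names. Two replicate regions guarded by the same %mask and
// separated only by an empty block:
//
//   [pred.a: entry -> then.a -> merge.a] -> empty -> [pred.b: entry -> then.b -> merge.b]
//
// become a single region: the recipes of then.a are moved into then.b, the
// VPPredInstPHIRecipes of merge.a are moved into merge.b (or dropped if
// unused), and pred.a is disconnected from the CFG.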
488
489 static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
490 VPlan &Plan) {
491 Instruction *Instr = PredRecipe->getUnderlyingInstr();
492 // Build the triangular if-then region.
493 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
494 assert(Instr->getParent() && "Predicated instruction not in any basic block");
495 auto *BlockInMask = PredRecipe->getMask();
496 auto *MaskDef = BlockInMask->getDefiningRecipe();
497 auto *BOMRecipe = new VPBranchOnMaskRecipe(
498 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
499 auto *Entry =
500 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
501
502 // Replace predicated replicate recipe with a replicate recipe without a
503 // mask but in the replicate region.
504 auto *RecipeWithoutMask = new VPReplicateRecipe(
505 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
506 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
507 PredRecipe->getDebugLoc());
508 auto *Pred =
509 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
510
511 VPPredInstPHIRecipe *PHIRecipe = nullptr;
512 if (PredRecipe->getNumUsers() != 0) {
513 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
514 RecipeWithoutMask->getDebugLoc());
515 PredRecipe->replaceAllUsesWith(PHIRecipe);
516 PHIRecipe->setOperand(0, RecipeWithoutMask);
517 }
518 PredRecipe->eraseFromParent();
519 auto *Exiting =
520 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
521 VPRegionBlock *Region =
522 Plan.createReplicateRegion(Entry, Exiting, RegionName);
523
524 // Note: first set Entry as region entry and then connect successors starting
525 // from it in order, to propagate the "parent" of each VPBasicBlock.
526 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
527 VPBlockUtils::connectBlocks(Pred, Exiting);
528
529 return Region;
530}
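// For illustration, the region built above has roughly the following shape
// (schematic, for a predicated instruction named "store"):
//
//   pred.store.entry:    BranchOnMask %mask
//   pred.store.if:       REPLICATE store ...   ; mask operand dropped
//   pred.store.continue: PHI-PREDICATED-INSTRUCTION (only if the result is used)
//
// with the entry block branching to both ".if" and ".continue", and ".if"
// falling through to ".continue".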
531
532static void addReplicateRegions(VPlan &Plan) {
533 SmallVector<VPReplicateRecipe *> WorkList;
534 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
535 vp_depth_first_deep(Plan.getEntry()))) {
536 for (VPRecipeBase &R : *VPBB)
537 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
538 if (RepR->isPredicated())
539 WorkList.push_back(RepR);
540 }
541 }
542
543 unsigned BBNum = 0;
544 for (VPReplicateRecipe *RepR : WorkList) {
545 VPBasicBlock *CurrentBlock = RepR->getParent();
546 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
547
548 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
549 SplitBlock->setName(
550 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
551 // Record predicated instructions for above packing optimizations.
552 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
553 Region->setParent(CurrentBlock->getParent());
554 VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);
555
556 VPRegionBlock *ParentRegion = Region->getParent();
557 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
558 ParentRegion->setExiting(SplitBlock);
559 }
560}
561
562/// Remove redundant VPBasicBlocks by merging them into their predecessor if
563/// the predecessor has a single successor.
564 static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
565 SmallVector<VPBasicBlock *> WorkList;
566 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
567 vp_depth_first_deep(Plan.getEntry()))) {
568 // Don't fold the blocks in the skeleton of the Plan into their single
569 // predecessors for now.
570 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
571 if (!VPBB->getParent())
572 continue;
573 auto *PredVPBB =
574 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
575 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
576 isa<VPIRBasicBlock>(PredVPBB))
577 continue;
578 WorkList.push_back(VPBB);
579 }
580
581 for (VPBasicBlock *VPBB : WorkList) {
582 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
583 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
584 R.moveBefore(*PredVPBB, PredVPBB->end());
585 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
586 auto *ParentRegion = VPBB->getParent();
587 if (ParentRegion && ParentRegion->getExiting() == VPBB)
588 ParentRegion->setExiting(PredVPBB);
589 for (auto *Succ : to_vector(VPBB->successors())) {
590 VPBlockUtils::disconnectBlocks(VPBB, Succ);
591 VPBlockUtils::connectBlocks(PredVPBB, Succ);
592 }
593 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
594 }
595 return !WorkList.empty();
596}
597
598 void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
599 // Convert masked VPReplicateRecipes to if-then region blocks.
600 addReplicateRegions(Plan);
601
602 bool ShouldSimplify = true;
603 while (ShouldSimplify) {
604 ShouldSimplify = sinkScalarOperands(Plan);
605 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
606 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
607 }
608}
609
610/// Remove redundant casts of inductions.
611///
612/// Such redundant casts are casts of induction variables that can be ignored,
613/// because we already proved that the casted phi is equal to the uncasted phi
614/// in the vectorized loop. There is no need to vectorize the cast - the same
615/// value can be used for both the phi and casts in the vector loop.
616 static void removeRedundantInductionCasts(VPlan &Plan) {
617 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
618 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
619 if (!IV || IV->getTruncInst())
620 continue;
621
622 // A sequence of IR Casts has potentially been recorded for IV, which
623 // *must be bypassed* when the IV is vectorized, because the vectorized IV
624 // will produce the desired casted value. This sequence forms a def-use
625 // chain and is provided in reverse order, ending with the cast that uses
626 // the IV phi. Search for the recipe of the last cast in the chain and
627 // replace it with the original IV. Note that only the final cast is
628 // expected to have users outside the cast-chain and the dead casts left
629 // over will be cleaned up later.
630 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
631 VPValue *FindMyCast = IV;
632 for (Instruction *IRCast : reverse(Casts)) {
633 VPSingleDefRecipe *FoundUserCast = nullptr;
634 for (auto *U : FindMyCast->users()) {
635 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
636 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
637 FoundUserCast = UserCast;
638 break;
639 }
640 }
641 FindMyCast = FoundUserCast;
642 }
643 FindMyCast->replaceAllUsesWith(IV);
644 }
645}
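// For illustration, a simplified case for the transform above. If the
// induction descriptor recorded the cast chain
//
//   %iv.cast = trunc %iv
//
// and the vectorized IV already produces the casted value, all users of
// %iv.cast are rewired to the WIDEN-INDUCTION recipe for %iv; the now-dead
// cast recipes are cleaned up later by removeDeadRecipes.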
646
647/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
648/// recipe, if it exists.
649 static void removeRedundantCanonicalIVs(VPlan &Plan) {
650 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
651 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
652 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
653 for (VPUser *U : CanonicalIV->users()) {
654 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
655 if (WidenNewIV)
656 break;
657 }
658
659 if (!WidenNewIV)
660 return;
661
662 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
663 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
664 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
665
666 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
667 continue;
668
669 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
670 // everything WidenNewIV's users need. That is, WidenOriginalIV will
671 // generate a vector phi or all users of WidenNewIV demand the first lane
672 // only.
673 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
674 vputils::onlyFirstLaneUsed(WidenNewIV)) {
675 // We are replacing a wide canonical iv with a suitable wide induction.
676 // The wide canonical IV is used to compute the header mask, hence all
677 // lanes will be used and we need to drop wrap flags that only apply to
678 // lanes guaranteed to execute in the original scalar loop.
679 WidenOriginalIV->dropPoisonGeneratingFlags();
680 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
681 WidenNewIV->eraseFromParent();
682 return;
683 }
684 }
685}
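// For illustration, a hedged sketch of the replacement above. If the header
// contains both
//
//   WIDEN-INDUCTION %iv = phi 0, %iv.next     ; canonical: starts at 0, step 1
//   EMIT %wide.canon.iv = WIDEN-CANONICAL-INDUCTION
//
// then %wide.canon.iv is redundant: its users (typically the header-mask
// compare) are pointed at %iv instead, after dropping poison-generating
// wrap flags on %iv.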
686
687/// Returns true if \p R is dead and can be removed.
688static bool isDeadRecipe(VPRecipeBase &R) {
689 // Do remove conditional assume instructions as their conditions may be
690 // flattened.
691 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
692 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
693 match(RepR, m_Intrinsic<Intrinsic::assume>());
694 if (IsConditionalAssume)
695 return true;
696
697 if (R.mayHaveSideEffects())
698 return false;
699
700 // Recipe is dead if no user keeps the recipe alive.
701 return all_of(R.definedValues(),
702 [](VPValue *V) { return V->getNumUsers() == 0; });
703}
704
705 void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
706 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
707 vp_post_order_deep(Plan.getEntry()))) {
708 // The recipes in the block are processed in reverse order, to catch chains
709 // of dead recipes.
710 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
711 if (isDeadRecipe(R)) {
712 R.eraseFromParent();
713 continue;
714 }
715
716 // Check if R is a dead VPPhi <-> update cycle and remove it.
717 auto *PhiR = dyn_cast<VPPhi>(&R);
718 if (!PhiR || PhiR->getNumOperands() != 2)
719 continue;
720 VPUser *PhiUser = PhiR->getSingleUser();
721 if (!PhiUser)
722 continue;
723 VPValue *Incoming = PhiR->getOperand(1);
724 if (PhiUser != Incoming->getDefiningRecipe() ||
725 Incoming->getNumUsers() != 1)
726 continue;
727 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
728 PhiR->eraseFromParent();
729 Incoming->getDefiningRecipe()->eraseFromParent();
730 }
731 }
732}
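// For illustration, the "dead VPPhi <-> update cycle" handled above looks,
// schematically, like
//
//   %p      = phi [ %start, preheader ], [ %p.next, latch ]
//   %p.next = add %p, 1        ; sole user of %p, and only feeds %p back
//
// Neither value is used elsewhere, so the phi is replaced by %start and both
// recipes are erased.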
733
734 static VPScalarIVStepsRecipe *
735 createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
736 Instruction::BinaryOps InductionOpcode,
737 FPMathOperator *FPBinOp, Instruction *TruncI,
738 VPValue *StartV, VPValue *Step, DebugLoc DL,
739 VPBuilder &Builder) {
740 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
741 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
742 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
743 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
744 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
745
746 // Truncate base induction if needed.
747 VPTypeAnalysis TypeInfo(Plan);
748 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
749 if (TruncI) {
750 Type *TruncTy = TruncI->getType();
751 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
752 "Not truncating.");
753 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
754 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
755 ResultTy = TruncTy;
756 }
757
758 // Truncate step if needed.
759 Type *StepTy = TypeInfo.inferScalarType(Step);
760 if (ResultTy != StepTy) {
761 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
762 "Not truncating.");
763 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
764 auto *VecPreheader =
766 VPBuilder::InsertPointGuard Guard(Builder);
767 Builder.setInsertPoint(VecPreheader);
768 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
769 }
770 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
771 &Plan.getVF(), DL);
772}
773
774 static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
775 SetVector<VPUser *> Users(V->user_begin(), V->user_end());
776 for (unsigned I = 0; I != Users.size(); ++I) {
777 VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
778 if (isa<VPHeaderPHIRecipe>(Cur))
779 continue;
780 for (VPValue *V : Cur->definedValues())
781 Users.insert_range(V->users());
782 }
783 return Users.takeVector();
784}
785
786/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
787/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
788/// generates scalar values.
789static VPValue *
790 scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV,
791 VPlan &Plan, VPBuilder &Builder) {
792 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
793 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
794 VPValue *StepV = PtrIV->getOperand(1);
795 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
796 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
797 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
798
799 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
800 PtrIV->getDebugLoc(), "next.gep");
801}
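// For illustration, a rough sketch of the lowering above. A pointer
// induction with start %p and step %s whose vector values are never needed
// becomes
//
//   %steps    = SCALAR-STEPS 0, %s        ; integer offsets 0, %s, 2*%s, ...
//   %next.gep = ptradd %p, %steps
//
// i.e. the per-lane pointers are rebuilt from the scalar start pointer plus
// scaled integer offsets.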
802
803/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
804/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
805/// VPWidenPointerInductionRecipe will generate vectors only. If some users
806/// require vectors while others require scalars, the scalar uses need to extract
807/// the scalars from the generated vectors (note that this is different from how
808/// int/fp inductions are handled). Legalize extract-from-ends using uniform
809/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
810/// the correct end value is available. Also optimize
811/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
812/// providing them scalar steps built on the canonical scalar IV and update the
813/// original IV's users. This is an optional optimization to reduce the needs of
814/// vector extracts.
815 static void legalizeAndOptimizeInductions(VPlan &Plan) {
816 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
817 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
818 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
819 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
820 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
821 if (!PhiR)
822 continue;
823
824 // Try to narrow wide and replicating recipes to uniform recipes, based on
825 // VPlan analysis.
826 // TODO: Apply to all recipes in the future, to replace legacy uniformity
827 // analysis.
828 auto Users = collectUsersRecursively(PhiR);
829 for (VPUser *U : reverse(Users)) {
830 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
831 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
832 // Skip recipes that shouldn't be narrowed.
833 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
834 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
835 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
836 continue;
837
838 // Skip recipes that may have lanes other than their first one used.
839 if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
840 continue;
841
842 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
843 Def->operands(), /*IsUniform*/ true,
844 /*Mask*/ nullptr, /*Flags*/ *Def);
845 Clone->insertAfter(Def);
846 Def->replaceAllUsesWith(Clone);
847 }
848
849 // Replace wide pointer inductions which have only their scalars used by
850 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
851 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
852 if (!Plan.hasScalarVFOnly() &&
853 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
854 continue;
855
856 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
857 PtrIV->replaceAllUsesWith(PtrAdd);
858 continue;
859 }
860
861 // Replace widened induction with scalar steps for users that only use
862 // scalars.
863 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
864 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
865 return U->usesScalars(WideIV);
866 }))
867 continue;
868
869 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
870 VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
871 Plan, ID.getKind(), ID.getInductionOpcode(),
872 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
873 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
874 WideIV->getDebugLoc(), Builder);
875
876 // Update scalar users of IV to use Step instead.
877 if (!HasOnlyVectorVFs) {
878 assert(!Plan.hasScalableVF() &&
879 "plans containing a scalar VF cannot also include scalable VFs");
880 WideIV->replaceAllUsesWith(Steps);
881 } else {
882 bool HasScalableVF = Plan.hasScalableVF();
883 WideIV->replaceUsesWithIf(Steps,
884 [WideIV, HasScalableVF](VPUser &U, unsigned) {
885 if (HasScalableVF)
886 return U.usesFirstLaneOnly(WideIV);
887 return U.usesScalars(WideIV);
888 });
889 }
890 }
891}
892
893/// Check if \p VPV is an untruncated wide induction, either before or after the
894/// increment. If so return the header IV (before the increment), otherwise
895/// return null.
896 static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
897 ScalarEvolution &SE) {
898 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
899 if (WideIV) {
900 // VPV itself is a wide induction, separately compute the end value for exit
901 // users if it is not a truncated IV.
902 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
903 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
904 }
905
906 // Check if VPV is an optimizable induction increment.
907 VPRecipeBase *Def = VPV->getDefiningRecipe();
908 if (!Def || Def->getNumOperands() != 2)
909 return nullptr;
910 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
911 if (!WideIV)
912 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
913 if (!WideIV)
914 return nullptr;
915
916 auto IsWideIVInc = [&]() {
917 auto &ID = WideIV->getInductionDescriptor();
918
919 // Check if VPV increments the induction by the induction step.
920 VPValue *IVStep = WideIV->getStepValue();
921 switch (ID.getInductionOpcode()) {
922 case Instruction::Add:
923 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
924 case Instruction::FAdd:
925 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
926 m_Specific(IVStep)));
927 case Instruction::FSub:
928 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
929 m_Specific(IVStep)));
930 case Instruction::Sub: {
931 // IVStep will be the negated step of the subtraction. Check if Step == -1
932 // * IVStep.
933 VPValue *Step;
934 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
935 return false;
936 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
937 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
938 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
939 !isa<SCEVCouldNotCompute>(StepSCEV) &&
940 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
941 }
942 default:
943 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
944 match(VPV, m_GetElementPtr(m_Specific(WideIV),
945 m_Specific(WideIV->getStepValue())));
946 }
947 llvm_unreachable("should have been covered by switch above");
948 };
949 return IsWideIVInc() ? WideIV : nullptr;
950}
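// For illustration, examples of values the helper above accepts (schematic):
//
//   %iv      = WIDEN-INDUCTION %start, %step    ; returned directly
//   %iv.next = add %iv, %step                   ; returns %iv
//   %iv.next = sub %iv, %neg.step               ; ok if %neg.step == -%step
//
// A truncated int/fp induction is rejected, since its end value cannot be
// reused as-is.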
951
952/// Attempts to optimize the induction variable exit values for users in the
953/// early exit block.
954 static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
955 VPTypeAnalysis &TypeInfo,
956 VPBlockBase *PredVPBB,
957 VPValue *Op,
958 ScalarEvolution &SE) {
959 VPValue *Incoming, *Mask;
962 return nullptr;
963
964 auto *WideIV = getOptimizableIVOf(Incoming, SE);
965 if (!WideIV)
966 return nullptr;
967
968 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
969 if (WideIntOrFp && WideIntOrFp->getTruncInst())
970 return nullptr;
971
972 // Calculate the final index.
973 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
974 auto *CanonicalIV = LoopRegion->getCanonicalIV();
975 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
976 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
977
978 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
979 VPValue *FirstActiveLane =
980 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
981 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
982 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
983 FirstActiveLaneType, DL);
984 VPValue *EndValue =
985 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
986
987 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
988 // changed it means the exit is using the incremented value, so we need to
989 // add the step.
990 if (Incoming != WideIV) {
991 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
992 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
993 }
994
995 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
996 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
997 VPValue *Start = WideIV->getStartValue();
998 VPValue *Step = WideIV->getStepValue();
999 EndValue = B.createDerivedIV(
1000 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1001 Start, EndValue, Step);
1002 }
1003
1004 return EndValue;
1005}
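// For illustration, the end value computed above is, conceptually,
//
//   %end = %canonical.iv + FirstActiveLane(%mask)   (+ 1 if the exit uses
//                                                     the incremented IV)
//
// and, for non-canonical inductions, it is then mapped through a DerivedIV
// (start + %end * step) so the exit phi sees the original IV's value.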
1006
1007/// Attempts to optimize the induction variable exit values for users in the
1008/// exit block coming from the latch in the original scalar loop.
1009 static VPValue *optimizeLatchExitInductionUser(
1010 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1011 DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
1012 VPValue *Incoming;
1013 if (!match(Op, m_ExtractLastLanePerPart(m_VPValue(Incoming))))
1014 return nullptr;
1015
1016 auto *WideIV = getOptimizableIVOf(Incoming, SE);
1017 if (!WideIV)
1018 return nullptr;
1019
1020 VPValue *EndValue = EndValues.lookup(WideIV);
1021 assert(EndValue && "end value must have been pre-computed");
1022
1023 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1024 // changed it means the exit is using the incremented value, so we don't
1025 // need to subtract the step.
1026 if (Incoming != WideIV)
1027 return EndValue;
1028
1029 // Otherwise, subtract the step from the EndValue.
1030 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1031 VPValue *Step = WideIV->getStepValue();
1032 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1033 if (ScalarTy->isIntegerTy())
1034 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1035 DebugLoc::getUnknown(), "ind.escape");
1036 if (ScalarTy->isPointerTy()) {
1037 Type *StepTy = TypeInfo.inferScalarType(Step);
1038 auto *Zero = Plan.getConstantInt(StepTy, 0);
1039 return B.createPtrAdd(EndValue,
1040 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1041 DebugLoc::getUnknown(), "ind.escape");
1042 }
1043 if (ScalarTy->isFloatingPointTy()) {
1044 const auto &ID = WideIV->getInductionDescriptor();
1045 return B.createNaryOp(
1046 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1047 ? Instruction::FSub
1048 : Instruction::FAdd,
1049 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1050 }
1051 llvm_unreachable("all possible induction types must be handled");
1052 return nullptr;
1053}
1054
1056 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1057 ScalarEvolution &SE) {
1058 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1059 VPTypeAnalysis TypeInfo(Plan);
1060 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1061 for (VPRecipeBase &R : ExitVPBB->phis()) {
1062 auto *ExitIRI = cast<VPIRPhi>(&R);
1063
1064 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1065 VPValue *Escape = nullptr;
1066 if (PredVPBB == MiddleVPBB)
1067 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1068 ExitIRI->getOperand(Idx),
1069 EndValues, SE);
1070 else
1071 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
1072 ExitIRI->getOperand(Idx), SE);
1073 if (Escape)
1074 ExitIRI->setOperand(Idx, Escape);
1075 }
1076 }
1077 }
1078}
1079
1080/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1081/// them with already existing recipes expanding the same SCEV expression.
1082 static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
1083 DenseMap<const SCEV *, VPValue *> SCEV2VPV;
1084
1085 for (VPRecipeBase &R :
1086 make_early_inc_range(*Plan.getEntry())) {
1087 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1088 if (!ExpR)
1089 continue;
1090
1091 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1092 if (Inserted)
1093 continue;
1094 ExpR->replaceAllUsesWith(V->second);
1095 ExpR->eraseFromParent();
1096 }
1097}
1098
1099 static void recursivelyDeleteDeadRecipes(VPValue *V) {
1100 SmallVector<VPValue *> WorkList;
1101 SmallPtrSet<VPValue *, 8> Seen;
1102 WorkList.push_back(V);
1103
1104 while (!WorkList.empty()) {
1105 VPValue *Cur = WorkList.pop_back_val();
1106 if (!Seen.insert(Cur).second)
1107 continue;
1108 VPRecipeBase *R = Cur->getDefiningRecipe();
1109 if (!R)
1110 continue;
1111 if (!isDeadRecipe(*R))
1112 continue;
1113 append_range(WorkList, R->operands());
1114 R->eraseFromParent();
1115 }
1116}
1117
1118/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1119/// Returns an optional pair, where the first element indicates whether it is
1120/// an intrinsic ID.
1121static std::optional<std::pair<bool, unsigned>>
1122 getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1123 return TypeSwitch<const VPSingleDefRecipe *,
1124 std::optional<std::pair<bool, unsigned>>>(R)
1127 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1128 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1129 return std::make_pair(true, I->getVectorIntrinsicID());
1130 })
1131 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1132 // For recipes that do not directly map to LLVM IR instructions,
1133 // assign opcodes after the last VPInstruction opcode (which is also
1134 // after the last IR Instruction opcode), based on the VPDefID.
1135 return std::make_pair(false,
1136 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1137 })
1138 .Default([](auto *) { return std::nullopt; });
1139}
1140
1141/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1142/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1143/// Operands are foldable live-ins.
1145 ArrayRef<VPValue *> Operands,
1146 const DataLayout &DL,
1147 VPTypeAnalysis &TypeInfo) {
1148 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1149 if (!OpcodeOrIID)
1150 return nullptr;
1151
1152 SmallVector<Value *> Ops;
1153 for (VPValue *Op : Operands) {
1154 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1155 return nullptr;
1156 Ops.push_back(Op->getLiveInIRValue());
1157 }
1158
1159 auto FoldToIRValue = [&]() -> Value * {
1160 InstSimplifyFolder Folder(DL);
1161 if (OpcodeOrIID->first) {
1162 if (R.getNumOperands() != 2)
1163 return nullptr;
1164 unsigned ID = OpcodeOrIID->second;
1165 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1166 TypeInfo.inferScalarType(&R));
1167 }
1168 unsigned Opcode = OpcodeOrIID->second;
1169 if (Instruction::isBinaryOp(Opcode))
1170 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1171 Ops[0], Ops[1]);
1172 if (Instruction::isCast(Opcode))
1173 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1174 TypeInfo.inferScalarType(R.getVPSingleValue()));
1175 switch (Opcode) {
1177 return Folder.FoldSelect(Ops[0], Ops[1],
1179 case VPInstruction::Not:
1180 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1182 case Instruction::Select:
1183 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1184 case Instruction::ICmp:
1185 case Instruction::FCmp:
1186 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1187 Ops[1]);
1188 case Instruction::GetElementPtr: {
1189 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1190 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1191 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1192 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1193 }
1194 case VPInstruction::PtrAdd:
1195 case VPInstruction::WidePtrAdd:
1196 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1197 Ops[0], Ops[1],
1198 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1199 // An extract of a live-in is an extract of a broadcast, so return the
1200 // broadcasted element.
1201 case Instruction::ExtractElement:
1202 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1203 return Ops[0];
1204 }
1205 return nullptr;
1206 };
1207
1208 if (Value *V = FoldToIRValue())
1209 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1210 return nullptr;
1211}
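// For illustration, a small hedged example of the folding above (recipe
// spelling simplified). With constant live-in operands
//
//   EMIT %r = add ir<3>, ir<4>
//
// the InstSimplifyFolder produces the IR constant 7, which is added to the
// plan as a live-in and replaces all uses of %r.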
1212
1213/// Try to simplify VPSingleDefRecipe \p Def.
1214 static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
1215 VPlan *Plan = Def->getParent()->getPlan();
1216
1217 // Simplification of live-in IR values for SingleDef recipes using
1218 // InstSimplifyFolder.
1219 const DataLayout &DL =
1220 Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
1221 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1222 return Def->replaceAllUsesWith(V);
1223
1224 // Fold PredPHI LiveIn -> LiveIn.
1225 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1226 VPValue *Op = PredPHI->getOperand(0);
1227 if (Op->isLiveIn())
1228 PredPHI->replaceAllUsesWith(Op);
1229 }
1230
1231 VPBuilder Builder(Def);
1232 VPValue *A;
1233 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1234 Type *TruncTy = TypeInfo.inferScalarType(Def);
1235 Type *ATy = TypeInfo.inferScalarType(A);
1236 if (TruncTy == ATy) {
1237 Def->replaceAllUsesWith(A);
1238 } else {
1239 // Don't replace a scalarizing recipe with a widened cast.
1240 if (isa<VPReplicateRecipe>(Def))
1241 return;
1242 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1243
1244 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1245 ? Instruction::SExt
1246 : Instruction::ZExt;
1247 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1248 TruncTy);
1249 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1250 // UnderlyingExt has distinct return type, used to retain legacy cost.
1251 Ext->setUnderlyingValue(UnderlyingExt);
1252 }
1253 Def->replaceAllUsesWith(Ext);
1254 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1255 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1256 Def->replaceAllUsesWith(Trunc);
1257 }
1258 }
1259#ifndef NDEBUG
1260 // Verify that the cached type info for both A and its users is still
1261 // accurate by comparing it to freshly computed types.
1262 VPTypeAnalysis TypeInfo2(*Plan);
1263 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1264 for (VPUser *U : A->users()) {
1265 auto *R = cast<VPRecipeBase>(U);
1266 for (VPValue *VPV : R->definedValues())
1267 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1268 }
1269#endif
1270 }
1271
1272 // Simplify (X && Y) || (X && !Y) -> X.
1273 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1274 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1275 // recipes to be visited during simplification.
1276 VPValue *X, *Y, *Z;
1277 if (match(Def,
1278 m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1279 m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
1280 Def->replaceAllUsesWith(X);
1281 Def->eraseFromParent();
1282 return;
1283 }
1284
1285 // x | 1 -> 1
1286 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1287 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1288
1289 // x | 0 -> x
1290 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1291 return Def->replaceAllUsesWith(X);
1292
1293 // x & 0 -> 0
1294 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1295 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1296
1297 // x && false -> false
1298 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1299 return Def->replaceAllUsesWith(Def->getOperand(1));
1300
1301 // (x && y) || (x && z) -> x && (y || z)
1302 if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1303 m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
1304 // Simplify only if one of the operands has one use to avoid creating an
1305 // extra recipe.
1306 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1307 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1308 return Def->replaceAllUsesWith(
1309 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1310
1311 // x && !x -> 0
1312 if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
1313 return Def->replaceAllUsesWith(Plan->getFalse());
1314
1315 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1316 return Def->replaceAllUsesWith(X);
1317
1318 // select !c, x, y -> select c, y, x
1319 VPValue *C;
1320 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1321 Def->setOperand(0, C);
1322 Def->setOperand(1, Y);
1323 Def->setOperand(2, X);
1324 return;
1325 }
1326
1327 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1328 // tail folding it is likely that x is a header mask and can be simplified
1329 // further.
1330 if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
1331 m_VPValue(Z))) &&
1332 X->hasMoreThanOneUniqueUser())
1333 return Def->replaceAllUsesWith(
1334 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1335
1336 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1337 return Def->replaceAllUsesWith(A);
1338
1339 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1340 return Def->replaceAllUsesWith(A);
1341
1342 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1343 return Def->replaceAllUsesWith(
1344 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1345
1346 if (match(Def, m_Not(m_VPValue(A)))) {
1347 if (match(A, m_Not(m_VPValue(A))))
1348 return Def->replaceAllUsesWith(A);
1349
1350 // Try to fold Not into compares by adjusting the predicate in-place.
1351 CmpPredicate Pred;
1352 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1353 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1354 if (all_of(Cmp->users(),
1356 m_Not(m_Specific(Cmp)),
1357 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1358 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1359 for (VPUser *U : to_vector(Cmp->users())) {
1360 auto *R = cast<VPSingleDefRecipe>(U);
1361 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1362 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1363 R->setOperand(1, Y);
1364 R->setOperand(2, X);
1365 } else {
1366 // not (cmp pred) -> cmp inv_pred
1367 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1368 R->replaceAllUsesWith(Cmp);
1369 }
1370 }
1371 // If Cmp doesn't have a debug location, use the one from the negation,
1372 // to preserve the location.
1373 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1374 Cmp->setDebugLoc(Def->getDebugLoc());
1375 }
1376 }
1377 }
1378
1379 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1380 // any-of (fcmp uno %A, %B), ...
1381 if (match(Def, m_AnyOf())) {
1382 SmallVector<VPValue *> NewOps;
1383 VPRecipeBase *UnpairedCmp = nullptr;
1384 for (VPValue *Op : Def->operands()) {
1385 VPValue *X;
1386 if (Op->getNumUsers() > 1 ||
1388 m_Deferred(X)))) {
1389 NewOps.push_back(Op);
1390 } else if (!UnpairedCmp) {
1391 UnpairedCmp = Op->getDefiningRecipe();
1392 } else {
1393 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1394 UnpairedCmp->getOperand(0), X));
1395 UnpairedCmp = nullptr;
1396 }
1397 }
1398
1399 if (UnpairedCmp)
1400 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1401
1402 if (NewOps.size() < Def->getNumOperands()) {
1403 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1404 return Def->replaceAllUsesWith(NewAnyOf);
1405 }
1406 }
1407
1408 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1409 // This is useful for fmax/fmin without fast-math flags, where we need to
1410 // check if any operand is NaN.
1412 m_Deferred(X)),
1414 m_Deferred(Y))))) {
1415 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1416 return Def->replaceAllUsesWith(NewCmp);
1417 }
1418
1419 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1420 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1421 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1422 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1423 TypeInfo.inferScalarType(Def))
1424 return Def->replaceAllUsesWith(Def->getOperand(1));
1425
1427 m_One()))) {
1428 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1429 if (TypeInfo.inferScalarType(X) != WideStepTy)
1430 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1431 Def->replaceAllUsesWith(X);
1432 return;
1433 }
1434
1435 // For i1 vp.merges produced by AnyOf reductions:
1436 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1438 m_VPValue(X), m_VPValue())) &&
1440 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1441 Def->setOperand(1, Def->getOperand(0));
1442 Def->setOperand(0, Y);
1443 return;
1444 }
1445
1446 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1447 if (Phi->getOperand(0) == Phi->getOperand(1))
1448 Phi->replaceAllUsesWith(Phi->getOperand(0));
1449 return;
1450 }
1451
1452 // Look through ExtractLastLane.
1453 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1454 if (match(A, m_BuildVector())) {
1455 auto *BuildVector = cast<VPInstruction>(A);
1456 Def->replaceAllUsesWith(
1457 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1458 return;
1459 }
1460 if (Plan->hasScalarVFOnly())
1461 return Def->replaceAllUsesWith(A);
1462 }
1463
1464 // Look through ExtractPenultimateElement (BuildVector ....).
1466 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1467 Def->replaceAllUsesWith(
1468 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1469 return;
1470 }
1471
1472 uint64_t Idx;
1474 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1475 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1476 return;
1477 }
1478
1479 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1480 Def->replaceAllUsesWith(
1481 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1482 return;
1483 }
1484
1485 // Look through broadcast of single-scalar when used as select conditions; in
1486 // that case the scalar condition can be used directly.
1487 if (match(Def,
1490 "broadcast operand must be single-scalar");
1491 Def->setOperand(0, C);
1492 return;
1493 }
1494
1495 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1496 if (Phi->getNumOperands() == 1)
1497 Phi->replaceAllUsesWith(Phi->getOperand(0));
1498 return;
1499 }
1500
1501 // Some simplifications can only be applied after unrolling. Perform them
1502 // below.
1503 if (!Plan->isUnrolled())
1504 return;
1505
1506 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1507 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1508 isa<VPPhi>(X)) {
1509 auto *Phi = cast<VPPhi>(X);
1510 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1511 Phi->getSingleUser() == Def) {
1512 Phi->setOperand(0, Y);
1513 Def->replaceAllUsesWith(Phi);
1514 return;
1515 }
1516 }
1517
1518 // VPVectorPointer for part 0 can be replaced by their start pointer.
1519 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1520 if (VecPtr->isFirstPart()) {
1521 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1522 return;
1523 }
1524 }
1525
1526 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1527 // the first lane is demanded.
1528 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1529 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1530 Steps->replaceAllUsesWith(Steps->getOperand(0));
1531 return;
1532 }
1533 }
1534 // Simplify redundant ReductionStartVector recipes after unrolling.
1535 VPValue *StartV;
1536 if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1537 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1538 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1539 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1540 return PhiR && PhiR->isInLoop();
1541 });
1542 return;
1543 }
1544
1546 Def->replaceAllUsesWith(A);
1547 return;
1548 }
1549
1550 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1553 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1554 all_of(A->users(),
1555 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1556 return Def->replaceAllUsesWith(A);
1557 }
1558
1559 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1560 return Def->replaceAllUsesWith(A);
1561}
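// For illustration, a representative rewrite performed by simplifyRecipe,
// sketched with simplified operand names. The mask pattern
//
//   %m = (%x && %y) || (%x && !%y)
//
// collapses to plain %x, which with tail folding often exposes the header
// mask directly and enables further simplification on the caller's next
// iteration over the recipes.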
1562 void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
1563 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
1564 Plan.getEntry());
1566 VPTypeAnalysis TypeInfo(Plan);
1567 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
1568 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1569 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1570 simplifyRecipe(Def, TypeInfo);
1571 }
1572}
1573
1574 void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
1575 if (Plan.hasScalarVFOnly())
1576 return;
1577
1578 // Try to narrow wide and replicating recipes to single scalar recipes,
1579 // based on VPlan analysis. Only process blocks in the loop region for now,
1580 // without traversing into nested regions, as recipes in replicate regions
1581 // cannot be converted yet.
1582 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1583 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1584 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1585 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
1586 VPReplicateRecipe>(&R))
1587 continue;
1588 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1589 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1590 continue;
1591
1592 auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
1593 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1594 vputils::isSingleScalar(RepR->getOperand(1))) {
1595 auto *Clone = new VPReplicateRecipe(
1596 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1597 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1598 *RepR /*Metadata*/, RepR->getDebugLoc());
1599 Clone->insertBefore(RepOrWidenR);
1600 VPBuilder Builder(Clone);
1601 VPValue *ExtractOp = Clone->getOperand(0);
1602 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1603 ExtractOp =
1604 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1605 ExtractOp =
1606 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1607 Clone->setOperand(0, ExtractOp);
1608 RepR->eraseFromParent();
1609 continue;
1610 }
1611
1612 // Skip recipes that aren't single scalars.
1613 if (!vputils::isSingleScalar(RepOrWidenR))
1614 continue;
1615
1616 // Skip recipes for which conversion to single-scalar would introduce
1617 // additional broadcasts. No extra broadcasts are needed if either only
1618 // the scalars of the recipe are used, or at least one of the operands
1619 // would require a broadcast. In the latter case, the single-scalar may
1620 // need to be broadcasted, but another broadcast is removed.
1621 if (!all_of(RepOrWidenR->users(),
1622 [RepOrWidenR](const VPUser *U) {
1623 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1624 unsigned Opcode = VPI->getOpcode();
1625 if (Opcode == VPInstruction::ExtractLastLane ||
1626 Opcode == VPInstruction::ExtractLastPart ||
1627 Opcode == VPInstruction::ExtractPenultimateElement)
1628 return true;
1629 }
1630
1631 return U->usesScalars(RepOrWidenR);
1632 }) &&
1633 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1634 if (Op->getSingleUser() != RepOrWidenR)
1635 return false;
1636 // Non-constant live-ins require broadcasts, while constants do not
1637 // need explicit broadcasts.
1638 bool LiveInNeedsBroadcast =
1639 Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
1640 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1641 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1642 }))
1643 continue;
1644
1645 auto *Clone = new VPReplicateRecipe(
1646 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1647 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1648 Clone->insertBefore(RepOrWidenR);
1649 RepOrWidenR->replaceAllUsesWith(Clone);
1650 if (isDeadRecipe(*RepOrWidenR))
1651 RepOrWidenR->eraseFromParent();
1652 }
1653 }
1654}
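
// For illustration, a sketch of the store narrowing above (value names
// assumed): a replicating store whose address is a single scalar, e.g.
//   REPLICATE store %wide.val, %uniform.addr
// can be rewritten to store only the final lane once:
//   %v = extract-last-lane (extract-last-part %wide.val)
//   CLONE store %v, %uniform.addr
// where the extract-last-part is only emitted when the address is uniform
// across VFs and UFs.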
1655
1656/// Check if all of \p Blend's masks share a common operand that is logically
1657/// and'ed in, and if so remove that operand from all of the masks.
1659 if (Blend->isNormalized())
1660 return;
1661 VPValue *CommonEdgeMask;
1662 if (!match(Blend->getMask(0),
1663 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1664 return;
1665 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1666 if (!match(Blend->getMask(I),
1667 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1668 return;
1669 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1670 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1671}
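
// Illustrative example of the common-mask removal above (mask names assumed):
//   BLEND %a/(%cm && %m0), %b/(%cm && %m1)
// becomes
//   BLEND %a/%m0, %b/%m1
// Intuitively, when %cm is false no lane of the blend's block is active, so
// those lanes of the result are never used and dropping %cm is safe.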
1672
1673/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1674/// to make sure the masks are simplified.
1675static void simplifyBlends(VPlan &Plan) {
1678 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1679 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1680 if (!Blend)
1681 continue;
1682
1683 removeCommonBlendMask(Blend);
1684
1685 // Try to remove redundant blend recipes.
1686 SmallPtrSet<VPValue *, 4> UniqueValues;
1687 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1688 UniqueValues.insert(Blend->getIncomingValue(0));
1689 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1690 if (!match(Blend->getMask(I), m_False()))
1691 UniqueValues.insert(Blend->getIncomingValue(I));
1692
1693 if (UniqueValues.size() == 1) {
1694 Blend->replaceAllUsesWith(*UniqueValues.begin());
1695 Blend->eraseFromParent();
1696 continue;
1697 }
1698
1699 if (Blend->isNormalized())
1700 continue;
1701
1702 // Normalize the blend so its first incoming value is used as the initial
1703 // value with the others blended into it.
1704
1705 unsigned StartIndex = 0;
1706 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1707 // If a value's mask is used only by the blend then it can be dead-coded.
1708 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1709 // that's used by multiple blends where it can be removed from them all.
1710 VPValue *Mask = Blend->getMask(I);
1711 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1712 StartIndex = I;
1713 break;
1714 }
1715 }
1716
1717 SmallVector<VPValue *, 4> OperandsWithMask;
1718 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1719
1720 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1721 if (I == StartIndex)
1722 continue;
1723 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1724 OperandsWithMask.push_back(Blend->getMask(I));
1725 }
1726
1727 auto *NewBlend =
1728 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1729 OperandsWithMask, Blend->getDebugLoc());
1730 NewBlend->insertBefore(&R);
1731
1732 VPValue *DeadMask = Blend->getMask(StartIndex);
1733 Blend->replaceAllUsesWith(NewBlend);
1734 Blend->eraseFromParent();
1736
1737 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1738 VPValue *NewMask;
1739 if (NewBlend->getNumOperands() == 3 &&
1740 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1741 VPValue *Inc0 = NewBlend->getOperand(0);
1742 VPValue *Inc1 = NewBlend->getOperand(1);
1743 VPValue *OldMask = NewBlend->getOperand(2);
1744 NewBlend->setOperand(0, Inc1);
1745 NewBlend->setOperand(1, Inc0);
1746 NewBlend->setOperand(2, NewMask);
1747 if (OldMask->getNumUsers() == 0)
1748 cast<VPInstruction>(OldMask)->eraseFromParent();
1749 }
1750 }
1751 }
1752}
1753
1754/// Optimize the width of vector induction variables in \p Plan based on a known
1755/// constant trip count, \p BestVF and \p BestUF.
1757 ElementCount BestVF,
1758 unsigned BestUF) {
1759 // Only proceed if we have not completely removed the vector region.
1760 if (!Plan.getVectorLoopRegion())
1761 return false;
1762
1763 const APInt *TC;
1764 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1765 return false;
1766
1767 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1768 // and UF. Returns at least 8.
1769 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1770 APInt AlignedTC =
1773 APInt MaxVal = AlignedTC - 1;
1774 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1775 };
1776 unsigned NewBitWidth =
1777 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1778
1779 LLVMContext &Ctx = Plan.getContext();
1780 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1781
1782 bool MadeChange = false;
1783
1784 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1785 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1786 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1787
1788 // Currently only handle canonical IVs, as it is trivial to replace the start
1789 // and stop values, and we only perform the optimization when the
1790 // IV has a single unique user.
1791 if (!WideIV || !WideIV->isCanonical() ||
1792 WideIV->hasMoreThanOneUniqueUser() ||
1793 NewIVTy == WideIV->getScalarType())
1794 continue;
1795
1796 // Currently only handle cases where the single user is a header-mask
1797 // comparison with the backedge-taken-count.
1798 VPUser *SingleUser = WideIV->getSingleUser();
1799 if (!SingleUser ||
1800 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1803 continue;
1804
1805 // Update IV operands and comparison bound to use new narrower type.
1806 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1807 WideIV->setStartValue(NewStart);
1808 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1809 WideIV->setStepValue(NewStep);
1810
1811 auto *NewBTC = new VPWidenCastRecipe(
1812 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1813 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1814 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1815 Cmp->setOperand(1, NewBTC);
1816
1817 MadeChange = true;
1818 }
1819
1820 return MadeChange;
1821}
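
// Worked example (illustrative): for a known trip count of 100 with VF = 4 and
// UF = 2, the trip count is aligned up to a multiple of VF * UF = 8, giving
// AlignedTC = 104 and MaxVal = 103, which needs 7 active bits; PowerOf2Ceil
// rounds this up to 8, so the induction and its bound can use i8.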
1822
1823/// Return true if \p Cond is known to be true for the given \p BestVF and
1824/// \p BestUF.
1826 ElementCount BestVF, unsigned BestUF,
1827 ScalarEvolution &SE) {
1829 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1830 &SE](VPValue *C) {
1831 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1832 });
1833
1834 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1836 m_Specific(CanIV->getBackedgeValue()),
1837 m_Specific(&Plan.getVectorTripCount()))))
1838 return false;
1839
1840 // The compare checks CanIV + VFxUF == vector trip count. If the vector trip
1841 // count is not available as a SCEV, compare directly against the original
1842 // trip count instead. This is stricter than necessary, as we will only
1843 // return true if the trip count == vector trip count.
1844 const SCEV *VectorTripCount =
1846 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1847 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1848 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1849 "Trip count SCEV must be computable");
1850 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1851 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1852 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1853}
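
// For example (illustrative values): with a trip count of 8, VF = 4 and
// UF = 2, the vector trip count equals VF * UF, so the latch condition
// CanIV + VFxUF == vector-trip-count is already true on the first iteration
// and the branch can be simplified.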
1854
1855/// Try to replace multiple active lane masks used for control flow with
1856/// a single, wide active lane mask instruction followed by multiple
1857/// extract subvector intrinsics. This applies to the active lane mask
1858/// instructions both in the loop and in the preheader.
1859/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1860/// new extracts from the first active lane mask, which has its last
1861/// operand (multiplier) set to UF.
1863 unsigned UF) {
1864 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1865 return false;
1866
1867 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1868 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1869 auto *Term = &ExitingVPBB->back();
1870
1871 using namespace llvm::VPlanPatternMatch;
1873 m_VPValue(), m_VPValue(), m_VPValue())))))
1874 return false;
1875
1876 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1877 LLVMContext &Ctx = Plan.getContext();
1878
1879 auto ExtractFromALM = [&](VPInstruction *ALM,
1880 SmallVectorImpl<VPValue *> &Extracts) {
1881 DebugLoc DL = ALM->getDebugLoc();
1882 for (unsigned Part = 0; Part < UF; ++Part) {
1884 Ops.append({ALM, Plan.getOrAddLiveIn(
1885 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1886 VF.getKnownMinValue() * Part))});
1887 auto *Ext =
1888 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1889 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1890 Extracts[Part] = Ext;
1891 Ext->insertAfter(ALM);
1892 }
1893 };
1894
1895 // Create a list of each active lane mask phi, ordered by unroll part.
1897 for (VPRecipeBase &R : Header->phis()) {
1899 if (!Phi)
1900 continue;
1901 VPValue *Index = nullptr;
1902 match(Phi->getBackedgeValue(),
1904 assert(Index && "Expected index from ActiveLaneMask instruction");
1905
1906 uint64_t Part;
1907 if (match(Index,
1909 m_VPValue(), m_ConstantInt(Part))))
1910 Phis[Part] = Phi;
1911 else
1912 // Anything other than a CanonicalIVIncrementForPart is part 0
1913 Phis[0] = Phi;
1914 }
1915
1916 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1917 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1918
1919 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1920 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1921
1922 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1923 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1924 "Expected incoming values of Phi to be ActiveLaneMasks");
1925
1926 // When using wide lane masks, the get.active.lane.mask intrinsic returns a
1927 // mask of VF x UF lanes; the multiplier (its last operand) is set to UF.
1928 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1929 EntryALM->setOperand(2, ALMMultiplier);
1930 LoopALM->setOperand(2, ALMMultiplier);
1931
1932 // Create UF x extract vectors and insert into preheader.
1933 SmallVector<VPValue *> EntryExtracts(UF);
1934 ExtractFromALM(EntryALM, EntryExtracts);
1935
1936 // Create UF x extract vectors and insert before the loop compare & branch,
1937 // updating the compare to use the first extract.
1938 SmallVector<VPValue *> LoopExtracts(UF);
1939 ExtractFromALM(LoopALM, LoopExtracts);
1940 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1941 Not->setOperand(0, LoopExtracts[0]);
1942
1943 // Update the incoming values of active lane mask phis.
1944 for (unsigned Part = 0; Part < UF; ++Part) {
1945 Phis[Part]->setStartValue(EntryExtracts[Part]);
1946 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1947 }
1948
1949 return true;
1950}
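
// Illustrative sketch with VF = 4 (fixed) and UF = 2, value names assumed:
//   %alm0 = active-lane-mask %index0, %tc, 1          ; <4 x i1>
//   %alm1 = active-lane-mask %index1, %tc, 1          ; <4 x i1>
// is replaced by a single wide mask plus subvector extracts:
//   %wide = active-lane-mask %index0, %tc, 2          ; <8 x i1>
//   %alm0 = vector-extract %wide, 0                   ; lanes 0..3
//   %alm1 = vector-extract %wide, 4                   ; lanes 4..7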
1951
1952/// Try to simplify the branch condition of \p Plan. This may restrict the
1953/// resulting plan to \p BestVF and \p BestUF.
1955 unsigned BestUF,
1957 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1958 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1959 auto *Term = &ExitingVPBB->back();
1960 VPValue *Cond;
1961 ScalarEvolution &SE = *PSE.getSE();
1962 if (match(Term, m_BranchOnCount()) ||
1964 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1965 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1966 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1967 const SCEV *VectorTripCount =
1969 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1970 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1971 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1972 "Trip count SCEV must be computable");
1973 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1974 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1975 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
1976 return false;
1977 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1978 // For BranchOnCond, check if we can prove the condition to be true using VF
1979 // and UF.
1980 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1981 return false;
1982 } else {
1983 return false;
1984 }
1985
1986 // The vector loop region only executes once. If possible, completely remove
1987 // the region, otherwise replace the terminator controlling the latch with
1988 // (BranchOnCond true).
1989 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1990 // support for other non-canonical widen induction recipes (e.g.,
1991 // VPWidenPointerInductionRecipe).
1992 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1993 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1994 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1995 return R->isCanonical();
1996 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1997 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1998 })) {
1999 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2000 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2001 VPBuilder Builder(Plan.getVectorPreheader());
2002 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2003 R->getScalarType());
2004 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2005 HeaderR.eraseFromParent();
2006 continue;
2007 }
2008 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2009 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2010 HeaderR.eraseFromParent();
2011 }
2012
2013 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2014 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
2015 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2016 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2017
2018 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2019 B->setParent(nullptr);
2020
2021 VPBlockUtils::connectBlocks(Preheader, Header);
2022 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2024 } else {
2025 // The vector region contains header phis for which we cannot remove the
2026 // loop region yet.
2027 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2028 {}, {}, Term->getDebugLoc());
2029 ExitingVPBB->appendRecipe(BOC);
2030 }
2031
2032 Term->eraseFromParent();
2033
2034 return true;
2035}
2036
2037/// From the definition of llvm.experimental.get.vector.length,
2038/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2042 vp_depth_first_deep(Plan.getEntry()))) {
2043 for (VPRecipeBase &R : *VPBB) {
2044 VPValue *AVL;
2045 if (!match(&R, m_EVL(m_VPValue(AVL))))
2046 continue;
2047
2048 ScalarEvolution &SE = *PSE.getSE();
2049 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
2050 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2051 continue;
2052 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2053 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2054 continue;
2055
2057 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2058 R.getDebugLoc());
2059 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2060 return true;
2061 }
2062 }
2063 return false;
2064}
2065
2067 unsigned BestUF,
2069 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2070 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2071
2072 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2073 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2074 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2075 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2076
2077 if (MadeChange) {
2078 Plan.setVF(BestVF);
2079 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2080 }
2081}
2082
2083/// Sink users of \p FOR after the recipe defining the previous value \p
2084/// Previous of the recurrence. \returns true if all users of \p FOR could be
2085/// re-arranged as needed or false if it is not possible.
2086static bool
2088 VPRecipeBase *Previous,
2089 VPDominatorTree &VPDT) {
2090 // Collect recipes that need sinking.
2093 Seen.insert(Previous);
2094 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2095 // The previous value must not depend on the users of the recurrence phi;
2096 // if it did, FOR would not be a fixed-order recurrence.
2097 if (SinkCandidate == Previous)
2098 return false;
2099
2100 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2101 !Seen.insert(SinkCandidate).second ||
2102 VPDT.properlyDominates(Previous, SinkCandidate))
2103 return true;
2104
2105 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2106 return false;
2107
2108 WorkList.push_back(SinkCandidate);
2109 return true;
2110 };
2111
2112 // Recursively sink users of FOR after Previous.
2113 WorkList.push_back(FOR);
2114 for (unsigned I = 0; I != WorkList.size(); ++I) {
2115 VPRecipeBase *Current = WorkList[I];
2116 assert(Current->getNumDefinedValues() == 1 &&
2117 "only recipes with a single defined value expected");
2118
2119 for (VPUser *User : Current->getVPSingleValue()->users()) {
2120 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2121 return false;
2122 }
2123 }
2124
2125 // Keep recipes to sink ordered by dominance so earlier instructions are
2126 // processed first.
2127 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2128 return VPDT.properlyDominates(A, B);
2129 });
2130
2131 for (VPRecipeBase *SinkCandidate : WorkList) {
2132 if (SinkCandidate == FOR)
2133 continue;
2134
2135 SinkCandidate->moveAfter(Previous);
2136 Previous = SinkCandidate;
2137 }
2138 return true;
2139}
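
// Illustrative sketch of the sinking above (recipe names assumed):
//   %for  = FIRST-ORDER-RECURRENCE-PHI [ %init ], [ %prev ]
//   %use  = add %for, 1            ; user of the recurrence
//   %prev = load %ptr              ; Previous, produces the next value
// is re-ordered to
//   %for  = FIRST-ORDER-RECURRENCE-PHI [ %init ], [ %prev ]
//   %prev = load %ptr
//   %use  = add %for, 1
// so the splice of (%for, %prev) created later dominates all users of %for.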
2140
2141/// Try to hoist \p Previous and its operands before all users of \p FOR.
2143 VPRecipeBase *Previous,
2144 VPDominatorTree &VPDT) {
2145 if (cannotHoistOrSinkRecipe(*Previous))
2146 return false;
2147
2148 // Collect recipes that need hoisting.
2149 SmallVector<VPRecipeBase *> HoistCandidates;
2151 VPRecipeBase *HoistPoint = nullptr;
2152 // Find the closest hoist point by looking at all users of FOR and selecting
2153 // the recipe dominating all other users.
2154 for (VPUser *U : FOR->users()) {
2155 auto *R = cast<VPRecipeBase>(U);
2156 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2157 HoistPoint = R;
2158 }
2159 assert(all_of(FOR->users(),
2160 [&VPDT, HoistPoint](VPUser *U) {
2161 auto *R = cast<VPRecipeBase>(U);
2162 return HoistPoint == R ||
2163 VPDT.properlyDominates(HoistPoint, R);
2164 }) &&
2165 "HoistPoint must dominate all users of FOR");
2166
2167 auto NeedsHoisting = [HoistPoint, &VPDT,
2168 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2169 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2170 if (!HoistCandidate)
2171 return nullptr;
2172 VPRegionBlock *EnclosingLoopRegion =
2173 HoistCandidate->getParent()->getEnclosingLoopRegion();
2174 assert((!HoistCandidate->getRegion() ||
2175 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2176 "CFG in VPlan should still be flat, without replicate regions");
2177 // Hoist candidate was already visited, no need to hoist.
2178 if (!Visited.insert(HoistCandidate).second)
2179 return nullptr;
2180
2181 // The candidate is defined outside the loop region or is a header phi; it
2182 // already dominates the users of FOR without hoisting.
2183 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2184 return nullptr;
2185
2186 // If we reached a recipe that dominates HoistPoint, we don't need to
2187 // hoist the recipe.
2188 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2189 return nullptr;
2190 return HoistCandidate;
2191 };
2192
2193 if (!NeedsHoisting(Previous->getVPSingleValue()))
2194 return true;
2195
2196 // Recursively try to hoist Previous and its operands before all users of FOR.
2197 HoistCandidates.push_back(Previous);
2198
2199 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2200 VPRecipeBase *Current = HoistCandidates[I];
2201 assert(Current->getNumDefinedValues() == 1 &&
2202 "only recipes with a single defined value expected");
2203 if (cannotHoistOrSinkRecipe(*Current))
2204 return false;
2205
2206 for (VPValue *Op : Current->operands()) {
2207 // If we reach FOR, it means the original Previous depends on some other
2208 // recurrence that in turn depends on FOR. If that is the case, we would
2209 // also need to hoist recipes involving the other FOR, which may break
2210 // dependencies.
2211 if (Op == FOR)
2212 return false;
2213
2214 if (auto *R = NeedsHoisting(Op)) {
2215 // Bail out if the recipe defines multiple values.
2216 // TODO: Hoisting such recipes requires additional handling.
2217 if (R->getNumDefinedValues() != 1)
2218 return false;
2219 HoistCandidates.push_back(R);
2220 }
2221 }
2222 }
2223
2224 // Order recipes to hoist by dominance so earlier instructions are processed
2225 // first.
2226 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2227 return VPDT.properlyDominates(A, B);
2228 });
2229
2230 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2231 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2232 HoistPoint->getIterator());
2233 }
2234
2235 return true;
2236}
2237
2239 VPBuilder &LoopBuilder) {
2240 VPDominatorTree VPDT(Plan);
2241
2243 for (VPRecipeBase &R :
2246 RecurrencePhis.push_back(FOR);
2247
2248 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2250 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2251 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2252 // to terminate.
2253 while (auto *PrevPhi =
2255 assert(PrevPhi->getParent() == FOR->getParent());
2256 assert(SeenPhis.insert(PrevPhi).second);
2257 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2258 }
2259
2260 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2261 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2262 return false;
2263
2264 // Introduce a recipe to combine the incoming and previous values of a
2265 // fixed-order recurrence.
2266 VPBasicBlock *InsertBlock = Previous->getParent();
2267 if (isa<VPHeaderPHIRecipe>(Previous))
2268 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2269 else
2270 LoopBuilder.setInsertPoint(InsertBlock,
2271 std::next(Previous->getIterator()));
2272
2273 auto *RecurSplice =
2275 {FOR, FOR->getBackedgeValue()});
2276
2277 FOR->replaceAllUsesWith(RecurSplice);
2278 // Set the first operand of RecurSplice to FOR again, after replacing
2279 // all users.
2280 RecurSplice->setOperand(0, FOR);
2281
2282 // Check for users extracting at the penultimate active lane of the FOR.
2283 // If only a single lane is active in the current iteration, we need to
2284 // select the last element from the previous iteration (from the FOR phi
2285 // directly).
2286 for (VPUser *U : RecurSplice->users()) {
2288 m_Specific(RecurSplice))))
2289 continue;
2290
2292 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2293 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2294 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2295 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2296 VPValue *PenultimateIndex =
2297 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2298 VPValue *PenultimateLastIter =
2299 B.createNaryOp(VPInstruction::ExtractLane,
2300 {PenultimateIndex, FOR->getBackedgeValue()});
2301 VPValue *LastPrevIter =
2302 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2303
2304 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2305 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2306 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2307 }
2308 }
2309 return true;
2310}
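
// Illustrative result of the adjustment above (names assumed): after the
// recurrence users have been sunk or its previous value hoisted, every user
// of the recurrence phi is rewired to a splice of the phi and previous value:
//   %for    = FIRST-ORDER-RECURRENCE-PHI [ %init ], [ %prev ]
//   %splice = first-order-splice %for, %prev
//   %use    = add %splice, 1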
2311
2313 for (VPRecipeBase &R :
2315 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2316 if (!PhiR)
2317 continue;
2318 RecurKind RK = PhiR->getRecurrenceKind();
2319 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2321 continue;
2322
2323 for (VPUser *U : collectUsersRecursively(PhiR))
2324 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2325 RecWithFlags->dropPoisonGeneratingFlags();
2326 }
2327 }
2328}
2329
2330namespace {
2331struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2332 static bool isSentinel(const VPSingleDefRecipe *Def) {
2333 return Def == getEmptyKey() || Def == getTombstoneKey();
2334 }
2335
2336 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2337 /// return that source element type.
2338 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2339 // All VPInstructions that lower to GEPs must have the i8 source element
2340 // type (as they are PtrAdds), so we omit it.
2342 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2343 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2344 return GEP->getSourceElementType();
2345 return nullptr;
2346 })
2347 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2348 [](auto *I) { return I->getSourceElementType(); })
2349 .Default([](auto *) { return nullptr; });
2350 }
2351
2352 /// Returns true if recipe \p Def can be safely handled for CSE.
2353 static bool canHandle(const VPSingleDefRecipe *Def) {
2354 // We can extend the list of handled recipes in the future,
2355 // provided we account for the data embedded in them while checking for
2356 // equality or hashing.
2357 auto C = getOpcodeOrIntrinsicID(Def);
2358
2359 // The issue with (Insert|Extract)Value is that the index of the
2360 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2361 // VPlan.
2362 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2363 C->second == Instruction::ExtractValue)))
2364 return false;
2365
2366 // During CSE, we can only handle recipes that don't read from memory: if
2367 // they read from memory, there could be an intervening write to memory
2368 // before the next instance is CSE'd, leading to an incorrect result.
2369 return !Def->mayReadFromMemory();
2370 }
2371
2372 /// Hash the underlying data of \p Def.
2373 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2374 const VPlan *Plan = Def->getParent()->getPlan();
2375 VPTypeAnalysis TypeInfo(*Plan);
2376 hash_code Result = hash_combine(
2377 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2378 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2380 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2381 if (RFlags->hasPredicate())
2382 return hash_combine(Result, RFlags->getPredicate());
2383 return Result;
2384 }
2385
2386 /// Check equality of underlying data of \p L and \p R.
2387 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2388 if (isSentinel(L) || isSentinel(R))
2389 return L == R;
2390 if (L->getVPDefID() != R->getVPDefID() ||
2392 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2394 !equal(L->operands(), R->operands()))
2395 return false;
2397 "must have valid opcode info for both recipes");
2398 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2399 if (LFlags->hasPredicate() &&
2400 LFlags->getPredicate() !=
2401 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2402 return false;
2403 // Recipes in replicate regions implicitly depend on the region's predicate.
2404 // If either recipe is in a replicate region, only consider them equal if
2405 // both have the same parent.
2406 const VPRegionBlock *RegionL = L->getRegion();
2407 const VPRegionBlock *RegionR = R->getRegion();
2408 if (((RegionL && RegionL->isReplicator()) ||
2409 (RegionR && RegionR->isReplicator())) &&
2410 L->getParent() != R->getParent())
2411 return false;
2412 const VPlan *Plan = L->getParent()->getPlan();
2413 VPTypeAnalysis TypeInfo(*Plan);
2414 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2415 }
2416};
2417} // end anonymous namespace
2418
2419/// Perform common-subexpression elimination of VPSingleDefRecipes on the
2420/// \p Plan.
2422 VPDominatorTree VPDT(Plan);
2424
2426 vp_depth_first_deep(Plan.getEntry()))) {
2427 for (VPRecipeBase &R : *VPBB) {
2428 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2429 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2430 continue;
2431 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2432 // V must dominate Def for a valid replacement.
2433 if (!VPDT.dominates(V->getParent(), VPBB))
2434 continue;
2435 // Only keep flags present on both V and Def.
2436 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2437 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2438 Def->replaceAllUsesWith(V);
2439 continue;
2440 }
2441 CSEMap[Def] = Def;
2442 }
2443 }
2444}
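
// Illustrative example of the CSE above (recipes assumed): two otherwise
// identical recipes that differ only in their flags, e.g.
//   %a = add nsw %x, %y
//   %b = add nuw %x, %y
// hash and compare equal, so %b is replaced by %a and the flags on %a are
// intersected, leaving a plain add without nsw/nuw.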
2445
2446/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2447static void licm(VPlan &Plan) {
2448 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2449
2450 // Hoist any loop-invariant recipes from the vector loop region to the
2451 // preheader. Perform a shallow traversal of the vector loop region, to
2452 // exclude recipes in replicate regions. Since the top-level blocks in the
2453 // vector loop region are guaranteed to execute if the vector preheader is,
2454 // we don't need to check speculation safety.
2455 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2456 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2457 "Expected vector prehader's successor to be the vector loop region");
2459 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2460 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2462 continue;
2463 if (any_of(R.operands(), [](VPValue *Op) {
2464 return !Op->isDefinedOutsideLoopRegions();
2465 }))
2466 continue;
2467 R.moveBefore(*Preheader, Preheader->end());
2468 }
2469 }
2470}
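
// Illustrative example: a recipe such as
//   %inv = mul %live.in.a, %live.in.b
// whose operands are all defined outside the loop region is moved from the
// vector loop body to the vector preheader, so it executes only once.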
2471
2473 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2474 if (Plan.hasScalarVFOnly())
2475 return;
2476 // Keep track of created truncates, so they can be re-used. Note that we
2477 // cannot use RAUW after creating a new truncate, as this could make
2478 // other uses have different types for their operands, making them invalidly
2479 // typed.
2481 VPTypeAnalysis TypeInfo(Plan);
2482 VPBasicBlock *PH = Plan.getVectorPreheader();
2485 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2488 &R))
2489 continue;
2490
2491 VPValue *ResultVPV = R.getVPSingleValue();
2492 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2493 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2494 if (!NewResSizeInBits)
2495 continue;
2496
2497 // If the value wasn't vectorized, we must maintain the original scalar
2498 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2499 // skip casts which do not need to be handled explicitly here, as
2500 // redundant casts will be removed during recipe simplification.
2502 continue;
2503
2504 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2505 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2506 assert(OldResTy->isIntegerTy() && "only integer types supported");
2507 (void)OldResSizeInBits;
2508
2509 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2510
2511 // Any wrapping introduced by shrinking this operation shouldn't be
2512 // considered undefined behavior. So, we can't unconditionally copy
2513 // arithmetic wrapping flags to VPW.
2514 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2515 VPW->dropPoisonGeneratingFlags();
2516
2517 if (OldResSizeInBits != NewResSizeInBits &&
2518 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2519 // Extend result to original width.
2520 auto *Ext =
2521 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2522 Ext->insertAfter(&R);
2523 ResultVPV->replaceAllUsesWith(Ext);
2524 Ext->setOperand(0, ResultVPV);
2525 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2526 } else {
2527 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2528 "Only ICmps should not need extending the result.");
2529 }
2530
2531 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2533 continue;
2534
2535 // Shrink operands by introducing truncates as needed.
2536 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2537 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2538 auto *Op = R.getOperand(Idx);
2539 unsigned OpSizeInBits =
2541 if (OpSizeInBits == NewResSizeInBits)
2542 continue;
2543 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2544 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2545 if (!IterIsEmpty) {
2546 R.setOperand(Idx, ProcessedIter->second);
2547 continue;
2548 }
2549
2550 VPBuilder Builder;
2551 if (Op->isLiveIn())
2552 Builder.setInsertPoint(PH);
2553 else
2554 Builder.setInsertPoint(&R);
2555 VPWidenCastRecipe *NewOp =
2556 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2557 ProcessedIter->second = NewOp;
2558 R.setOperand(Idx, NewOp);
2559 }
2560
2561 }
2562 }
2563}
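
// Worked example (illustrative, assuming MinBWs maps the add to 8 bits):
//   %r = add <VF x i32> %a, %b
// becomes
//   %a.t = trunc <VF x i32> %a to <VF x i8>
//   %b.t = trunc <VF x i32> %b to <VF x i8>
//   %r   = add <VF x i8> %a.t, %b.t
//   %r.e = zext <VF x i8> %r to <VF x i32>   ; takes over the uses of %r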
2564
2568 VPValue *Cond;
2569 // Skip blocks that are not terminated by BranchOnCond.
2570 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2571 continue;
2572
2573 assert(VPBB->getNumSuccessors() == 2 &&
2574 "Two successors expected for BranchOnCond");
2575 unsigned RemovedIdx;
2576 if (match(Cond, m_True()))
2577 RemovedIdx = 1;
2578 else if (match(Cond, m_False()))
2579 RemovedIdx = 0;
2580 else
2581 continue;
2582
2583 VPBasicBlock *RemovedSucc =
2584 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2585 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2586 "There must be a single edge between VPBB and its successor");
2587 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2588 // these recipes.
2589 for (VPRecipeBase &R : RemovedSucc->phis())
2590 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2591
2592 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2593 // automatically on VPlan destruction if it becomes unreachable.
2594 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2595 VPBB->back().eraseFromParent();
2596 }
2597}
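
// Illustrative example: a block ending in
//   BranchOnCond true
// always takes its first successor, so the edge to the second successor is
// removed (dropping the matching incoming values from its phi recipes) and
// the now-unconditional terminator is erased.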
2598
2618
2619// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2620// the loop terminator with a branch-on-cond recipe with the negated
2621// active-lane-mask as operand. Note that this turns the loop into an
2622 // uncountable one. Only the existing terminator is replaced; all other existing
2623// recipes/users remain unchanged, except for poison-generating flags being
2624// dropped from the canonical IV increment. Return the created
2625// VPActiveLaneMaskPHIRecipe.
2626//
2627// The function uses the following definitions:
2628//
2629// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2630// calculate-trip-count-minus-VF (original TC) : original TC
2631// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2632// CanonicalIVPhi : CanonicalIVIncrement
2633// %StartV is the canonical induction start value.
2634//
2635// The function adds the following recipes:
2636//
2637// vector.ph:
2638// %TripCount = calculate-trip-count-minus-VF (original TC)
2639// [if DataWithControlFlowWithoutRuntimeCheck]
2640// %EntryInc = canonical-iv-increment-for-part %StartV
2641// %EntryALM = active-lane-mask %EntryInc, %TripCount
2642//
2643// vector.body:
2644// ...
2645// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2646// ...
2647// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2648// %ALM = active-lane-mask %InLoopInc, TripCount
2649// %Negated = Not %ALM
2650// branch-on-cond %Negated
2651//
2654 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2655 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2656 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2657 VPValue *StartV = CanonicalIVPHI->getStartValue();
2658
2659 auto *CanonicalIVIncrement =
2660 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2661 // TODO: Check if dropping the flags is needed if
2662 // !DataAndControlFlowWithoutRuntimeCheck.
2663 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2664 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2665 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2666 // we have to take unrolling into account. Each part needs to start at
2667 // Part * VF
2668 auto *VecPreheader = Plan.getVectorPreheader();
2669 VPBuilder Builder(VecPreheader);
2670
2671 // Create the ActiveLaneMask instruction using the correct start values.
2672 VPValue *TC = Plan.getTripCount();
2673
2674 VPValue *TripCount, *IncrementValue;
2676 // When the loop is guarded by a runtime overflow check for the loop
2677 // induction variable increment by VF, we can increment the value before
2678 // the get.active.lane mask and use the unmodified tripcount.
2679 IncrementValue = CanonicalIVIncrement;
2680 TripCount = TC;
2681 } else {
2682 // When avoiding a runtime check, the active.lane.mask inside the loop
2683 // uses a modified trip count and the induction variable increment is
2684 // done after the active.lane.mask intrinsic is called.
2685 IncrementValue = CanonicalIVPHI;
2686 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2687 {TC}, DL);
2688 }
2689 auto *EntryIncrement = Builder.createOverflowingOp(
2690 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2691 "index.part.next");
2692
2693 // Create the active lane mask instruction in the VPlan preheader.
2694 VPValue *ALMMultiplier =
2695 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2696 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2697 {EntryIncrement, TC, ALMMultiplier}, DL,
2698 "active.lane.mask.entry");
2699
2700 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2701 // preheader ActiveLaneMask instruction.
2702 auto *LaneMaskPhi =
2704 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2705
2706 // Create the active lane mask for the next iteration of the loop before the
2707 // original terminator.
2708 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2709 Builder.setInsertPoint(OriginalTerminator);
2710 auto *InLoopIncrement =
2711 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2712 {IncrementValue}, {false, false}, DL);
2713 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2714 {InLoopIncrement, TripCount, ALMMultiplier},
2715 DL, "active.lane.mask.next");
2716 LaneMaskPhi->addOperand(ALM);
2717
2718 // Replace the original terminator with BranchOnCond. We have to invert the
2719 // mask here because a true condition means jumping to the exit block.
2720 auto *NotMask = Builder.createNot(ALM, DL);
2721 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2722 OriginalTerminator->eraseFromParent();
2723 return LaneMaskPhi;
2724}
2725
2726/// Collect the header mask with the pattern:
2727/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2728/// TODO: Introduce explicit recipe for header-mask instead of searching
2729/// for the header-mask pattern manually.
2731 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2732 SmallVector<VPValue *> WideCanonicalIVs;
2733 auto *FoundWidenCanonicalIVUser = find_if(
2735 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2737 "Must have at most one VPWideCanonicalIVRecipe");
2738 if (FoundWidenCanonicalIVUser !=
2739 LoopRegion->getCanonicalIV()->users().end()) {
2740 auto *WideCanonicalIV =
2741 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2742 WideCanonicalIVs.push_back(WideCanonicalIV);
2743 }
2744
2745 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2746 // version of the canonical induction.
2747 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2748 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2749 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2750 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2751 WideCanonicalIVs.push_back(WidenOriginalIV);
2752 }
2753
2754 // Walk users of wide canonical IVs and find the single compare of the form
2755 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2756 VPSingleDefRecipe *HeaderMask = nullptr;
2757 for (auto *Wide : WideCanonicalIVs) {
2758 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2759 auto *VPI = dyn_cast<VPInstruction>(U);
2760 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2761 continue;
2762
2763 assert(VPI->getOperand(0) == Wide &&
2764 "WidenCanonicalIV must be the first operand of the compare");
2765 assert(!HeaderMask && "Multiple header masks found?");
2766 HeaderMask = VPI;
2767 }
2768 }
2769 return HeaderMask;
2770}
2771
2773 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2776 UseActiveLaneMaskForControlFlow) &&
2777 "DataAndControlFlowWithoutRuntimeCheck implies "
2778 "UseActiveLaneMaskForControlFlow");
2779
2780 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2781 auto *FoundWidenCanonicalIVUser = find_if(
2783 assert(FoundWidenCanonicalIVUser &&
2784 "Must have widened canonical IV when tail folding!");
2785 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2786 auto *WideCanonicalIV =
2787 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2788 VPSingleDefRecipe *LaneMask;
2789 if (UseActiveLaneMaskForControlFlow) {
2792 } else {
2793 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2794 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2795 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2796 LaneMask =
2797 B.createNaryOp(VPInstruction::ActiveLaneMask,
2798 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2799 nullptr, "active.lane.mask");
2800 }
2801
2802 // Walk users of WideCanonicalIV and replace the header mask of the form
2803 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2804 // removing the old one to ensure there is always only a single header mask.
2805 HeaderMask->replaceAllUsesWith(LaneMask);
2806 HeaderMask->eraseFromParent();
2807}
2808
2809template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2810 Op0_t In;
2812
2813 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2814
2815 template <typename OpTy> bool match(OpTy *V) const {
2816 if (m_Specific(In).match(V)) {
2817 Out = nullptr;
2818 return true;
2819 }
2820 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2821 }
2822};
2823
2824/// Match a specific mask \p In, or a combination of it (logical-and \p In, X).
2825/// On a match, \p Out is set to X, or to nullptr if the mask is exactly \p In.
2826template <typename Op0_t, typename Op1_t>
2827static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2828 Op1_t &Out) {
2829 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2830}
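
// Usage sketch (illustrative): strip a known header mask from a masked load,
// binding any remaining mask, as done in optimizeMaskToEVL below:
//   VPValue *Addr, *Mask;
//   if (match(&R, m_MaskedLoad(m_VPValue(Addr),
//                              m_RemoveMask(HeaderMask, Mask))))
//     ; // Mask is nullptr if the recipe was masked by HeaderMask alone.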
2831
2832/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2833/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2834/// recipe could be created.
2835/// \p HeaderMask Header Mask.
2836/// \p CurRecipe Recipe to be transformed.
2837/// \p TypeInfo VPlan-based type analysis.
2838/// \p EVL The explicit vector length parameter of vector-predication
2839/// intrinsics.
2841 VPRecipeBase &CurRecipe,
2842 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2843 VPlan *Plan = CurRecipe.getParent()->getPlan();
2844 DebugLoc DL = CurRecipe.getDebugLoc();
2845 VPValue *Addr, *Mask, *EndPtr;
2846
2847 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2848 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2849 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2850 EVLEndPtr->insertBefore(&CurRecipe);
2851 EVLEndPtr->setOperand(1, &EVL);
2852 return EVLEndPtr;
2853 };
2854
2855 if (match(&CurRecipe,
2856 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2857 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2858 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2859 EVL, Mask);
2860
2861 if (match(&CurRecipe,
2862 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2863 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2864 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2865 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2866 AdjustEndPtr(EndPtr), EVL, Mask);
2867
2868 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2869 m_RemoveMask(HeaderMask, Mask))) &&
2870 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2871 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2872 EVL, Mask);
2873
2874 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2875 m_RemoveMask(HeaderMask, Mask))) &&
2876 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2877 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2878 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2879 AdjustEndPtr(EndPtr), EVL, Mask);
2880
2881 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2882 if (Rdx->isConditional() &&
2883 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2884 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2885
2886 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2887 if (Interleave->getMask() &&
2888 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2889 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2890
2891 VPValue *LHS, *RHS;
2892 if (match(&CurRecipe,
2893 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2894 return new VPWidenIntrinsicRecipe(
2895 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2896 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2897
2898 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2899 m_VPValue(RHS))))
2900 return new VPWidenIntrinsicRecipe(
2901 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2902 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2903
2904 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2905 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2906 VPValue *ZExt =
2907 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2908 return new VPInstruction(Instruction::Sub,
2909 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2910 }
2911
2912 return nullptr;
2913}
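
// Illustrative example (recipe notation approximate): a load masked only by
// the header mask,
//   WIDEN %v = load %addr, mask: %header.mask
// becomes an EVL-based load without the mask,
//   WIDEN %v = vp.load %addr, EVL: %evl
// and a select on the header mask becomes a vp.merge keyed on %evl.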
2914
2915/// Replace recipes with their EVL variants.
2917 VPTypeAnalysis TypeInfo(Plan);
2918 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2919 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2920
2921 assert(all_of(Plan.getVF().users(),
2924 "User of VF that we can't transform to EVL.");
2925 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2927 });
2928
2929 assert(all_of(Plan.getVFxUF().users(),
2930 [&LoopRegion, &Plan](VPUser *U) {
2931 return match(U,
2932 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2933 m_Specific(&Plan.getVFxUF()))) ||
2934 isa<VPWidenPointerInductionRecipe>(U);
2935 }) &&
2936 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2937 "increment of the canonical induction.");
2938 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2939 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2940 // canonical induction must not be updated.
2942 });
2943
2944 // Defer erasing recipes till the end so that we don't invalidate the
2945 // VPTypeAnalysis cache.
2947
2948 // Create a scalar phi to track the previous EVL if the plan contains any
2949 // fixed-order recurrences.
2950 bool ContainsFORs =
2952 if (ContainsFORs) {
2953 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2954 VPValue *MaxEVL = &Plan.getVF();
2955 // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2956 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2957 MaxEVL = Builder.createScalarZExtOrTrunc(
2958 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2959 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2960
2961 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2962 VPValue *PrevEVL = Builder.createScalarPhi(
2963 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2964
2967 for (VPRecipeBase &R : *VPBB) {
2968 VPValue *V1, *V2;
2969 if (!match(&R,
2971 m_VPValue(V1), m_VPValue(V2))))
2972 continue;
2973 VPValue *Imm = Plan.getOrAddLiveIn(
2976 Intrinsic::experimental_vp_splice,
2977 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2978 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
2979 R.getDebugLoc());
2980 VPSplice->insertBefore(&R);
2981 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2982 ToErase.push_back(&R);
2983 }
2984 }
2985 }
2986
2987 VPValue *HeaderMask = findHeaderMask(Plan);
2988 if (!HeaderMask)
2989 return;
2990
2991 // Replace header masks with a mask equivalent to predicating by EVL:
2992 //
2993 // icmp ule widen-canonical-iv backedge-taken-count
2994 // ->
2995 // icmp ult step-vector, EVL
2996 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2997 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2998 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2999 VPValue *EVLMask = Builder.createICmp(
3001 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3002 HeaderMask->replaceAllUsesWith(EVLMask);
3003 ToErase.push_back(HeaderMask->getDefiningRecipe());
3004
3005 // Try to optimize header mask recipes away to their EVL variants.
3006 // TODO: Split optimizeMaskToEVL out and move into
3007 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
3008 // tryToBuildVPlanWithVPRecipes beforehand.
3009 for (VPUser *U : collectUsersRecursively(EVLMask)) {
3010 auto *CurRecipe = cast<VPRecipeBase>(U);
3011 VPRecipeBase *EVLRecipe =
3012 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
3013 if (!EVLRecipe)
3014 continue;
3015
3016 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
3017 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
3018 "New recipe must define the same number of values as the "
3019 "original.");
3020 EVLRecipe->insertBefore(CurRecipe);
3022 EVLRecipe)) {
3023 for (unsigned I = 0; I < NumDefVal; ++I) {
3024 VPValue *CurVPV = CurRecipe->getVPValue(I);
3025 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
3026 }
3027 }
3028 ToErase.push_back(CurRecipe);
3029 }
3030 // Remove dead EVL mask.
3031 if (EVLMask->getNumUsers() == 0)
3032 ToErase.push_back(EVLMask->getDefiningRecipe());
3033
3034 for (VPRecipeBase *R : reverse(ToErase)) {
3035 SmallVector<VPValue *> PossiblyDead(R->operands());
3036 R->eraseFromParent();
3037 for (VPValue *Op : PossiblyDead)
3039 }
3040}
3041
3042/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3043/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
3044/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation,
3045/// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
3046///
3047/// The function uses the following definitions:
3048/// %StartV is the canonical induction start value.
3049///
3050/// The function adds the following recipes:
3051///
3052/// vector.ph:
3053/// ...
3054///
3055/// vector.body:
3056/// ...
3057/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3058/// [ %NextEVLIV, %vector.body ]
3059/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3060/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3061/// ...
3062/// %OpEVL = cast i32 %VPEVL to IVSize
3063/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3064/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3065/// ...
3066///
3067/// If MaxSafeElements is provided, the function adds the following recipes:
3068/// vector.ph:
3069/// ...
3070///
3071/// vector.body:
3072/// ...
3073/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3074/// [ %NextEVLIV, %vector.body ]
3075/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3076/// %cmp = cmp ult %AVL, MaxSafeElements
3077/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3078/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3079/// ...
3080/// %OpEVL = cast i32 %VPEVL to IVSize
3081/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3082/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3083/// ...
3084///
3086 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3087 if (Plan.hasScalarVFOnly())
3088 return;
3089 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3090 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3091
3092 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3093 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3094 VPValue *StartV = CanonicalIVPHI->getStartValue();
3095
3096 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3097 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3098 EVLPhi->insertAfter(CanonicalIVPHI);
3099 VPBuilder Builder(Header, Header->getFirstNonPhi());
3100 // Create the AVL (application vector length), counting down from the trip
3101 // count to 0 in steps of EVL.
3102 VPPhi *AVLPhi = Builder.createScalarPhi(
3103 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3104 VPValue *AVL = AVLPhi;
3105
3106 if (MaxSafeElements) {
3107 // Clamp the AVL to MaxSafeElements to respect the maximum safe dependence distance.
3108 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3109 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3110 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3111 "safe_avl");
3112 }
3113 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3115
3116 auto *CanonicalIVIncrement =
3117 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3118 Builder.setInsertPoint(CanonicalIVIncrement);
3119 VPValue *OpVPEVL = VPEVL;
3120
3121 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3122 OpVPEVL = Builder.createScalarZExtOrTrunc(
3123 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3124
3125 auto *NextEVLIV = Builder.createOverflowingOp(
3126 Instruction::Add, {OpVPEVL, EVLPhi},
3127 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3128 CanonicalIVIncrement->hasNoSignedWrap()},
3129 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3130 EVLPhi->addOperand(NextEVLIV);
3131
3132 VPValue *NextAVL = Builder.createOverflowingOp(
3133 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3134 DebugLoc::getCompilerGenerated(), "avl.next");
3135 AVLPhi->addOperand(NextAVL);
3136
3137 transformRecipestoEVLRecipes(Plan, *VPEVL);
3138
3139 // Replace all uses of VPCanonicalIVPHIRecipe by
3140 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3141 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3142 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3143 // TODO: support unroll factor > 1.
3144 Plan.setUF(1);
3145}
3146
3148 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3149 // There should be only one EVL PHI in the entire plan.
3150 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3151
3154 for (VPRecipeBase &R : VPBB->phis())
3155 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3156 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3157 EVLPhi = PhiR;
3158 }
3159
3160 // Early return if no EVL PHI is found.
3161 if (!EVLPhi)
3162 return;
3163
3164 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3165 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3166 VPValue *AVL;
3167 [[maybe_unused]] bool FoundAVL =
3168 match(EVLIncrement,
3169 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3170 assert(FoundAVL && "Didn't find AVL?");
3171
3172 // The AVL may be capped to a safe distance.
3173 VPValue *SafeAVL;
3174 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3175 AVL = SafeAVL;
3176
3177 VPValue *AVLNext;
3178 [[maybe_unused]] bool FoundAVLNext =
3180 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3181 assert(FoundAVLNext && "Didn't find AVL backedge?");
3182
3183 // Convert EVLPhi to concrete recipe.
3184 auto *ScalarR =
3185 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3186 EVLPhi->getDebugLoc(), "evl.based.iv");
3187 EVLPhi->replaceAllUsesWith(ScalarR);
3188 EVLPhi->eraseFromParent();
3189
3190 // Replace CanonicalIVInc with EVL-PHI increment.
3191 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3192 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3193 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3194 m_Specific(&Plan.getVFxUF()))) &&
3195 "Unexpected canonical iv");
3196 Backedge->replaceAllUsesWith(EVLIncrement);
3197
3198 // Remove unused phi and increment.
3199 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3200 CanonicalIVIncrement->eraseFromParent();
3201 CanonicalIV->eraseFromParent();
3202
3203 // Replace the use of VectorTripCount in the latch-exiting block.
3204 // Before: (branch-on-count EVLIVInc, VectorTripCount)
3205 // After: (branch-on-cond eq AVLNext, 0)
3206
3207 VPBasicBlock *LatchExiting =
3208 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3209 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3210 // Skip single-iteration loop region
3211 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3212 return;
3213 assert(LatchExitingBr &&
3214 match(LatchExitingBr,
3215 m_BranchOnCount(m_VPValue(EVLIncrement),
3216 m_Specific(&Plan.getVectorTripCount()))) &&
3217 "Unexpected terminator in EVL loop");
3218
3219 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3220 VPBuilder Builder(LatchExitingBr);
3221 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3222 Plan.getConstantInt(AVLTy, 0));
3223 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
3224 LatchExitingBr->eraseFromParent();
3225}
3226
3228 VPlan &Plan, PredicatedScalarEvolution &PSE,
3229 const DenseMap<Value *, const SCEV *> &StridesMap) {
3230 // Replace VPValues for known constant strides guaranteed by predicated scalar
3231 // evolution.
3232 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3233 auto *R = cast<VPRecipeBase>(&U);
3234 return R->getRegion() ||
3235 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3236 };
3237 ValueToSCEVMapTy RewriteMap;
3238 for (const SCEV *Stride : StridesMap.values()) {
3239 using namespace SCEVPatternMatch;
3240 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3241 const APInt *StrideConst;
3242 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3243 // Only handle constant strides for now.
3244 continue;
3245
3246 auto *CI = Plan.getConstantInt(*StrideConst);
3247 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3248 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3249
3250 // The versioned value may not be used in the loop directly but through a
3251 // sext/zext. Add new live-ins in those cases.
3252 for (Value *U : StrideV->users()) {
3253 if (!isa<SExtInst, ZExtInst>(U))
3254 continue;
3255 VPValue *StrideVPV = Plan.getLiveIn(U);
3256 if (!StrideVPV)
3257 continue;
3258 unsigned BW = U->getType()->getScalarSizeInBits();
3259 APInt C =
3260 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3261 VPValue *CI = Plan.getConstantInt(C);
3262 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3263 }
3264 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3265 }
3266
3267 for (VPRecipeBase &R : *Plan.getEntry()) {
3268 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3269 if (!ExpSCEV)
3270 continue;
3271 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3272 auto *NewSCEV =
3273 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3274 if (NewSCEV != ScevExpr) {
3275 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3276 ExpSCEV->replaceAllUsesWith(NewExp);
3277 if (Plan.getTripCount() == ExpSCEV)
3278 Plan.resetTripCount(NewExp);
3279 }
3280 }
3281}
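// Illustrative example (assumed IR names): if versioning guarantees that
// %stride == 1, the live-in VPValue for %stride is replaced by the constant 1
// for users inside the vector-loop region, and a user such as
//   %stride.ext = sext i32 %stride to i64
// is replaced by the correspondingly extended constant; SCEV expressions
// expanded in the entry block are rewritten with the versioned stride as well.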
3282
3283 void VPlanTransforms::dropPoisonGeneratingRecipes(
3284 VPlan &Plan,
3285 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3286 // Collect recipes in the backward slice of `Root` that may generate a poison
3287 // value that is used after vectorization.
3288 SmallPtrSet<VPRecipeBase *, 16> Visited;
3289 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3290 SmallVector<VPRecipeBase *, 16> Worklist;
3291 Worklist.push_back(Root);
3292
3293 // Traverse the backward slice of Root through its use-def chain.
3294 while (!Worklist.empty()) {
3295 VPRecipeBase *CurRec = Worklist.pop_back_val();
3296
3297 if (!Visited.insert(CurRec).second)
3298 continue;
3299
3300 // Prune search if we find another recipe generating a widen memory
3301 // instruction. Widen memory instructions involved in address computation
3302 // will lead to gather/scatter instructions, which don't need to be
3303 // handled.
3304 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
3305 VPHeaderPHIRecipe>(CurRec))
3306 continue;
3307
3308 // This recipe contributes to the address computation of a widen
3309 // load/store. If the underlying instruction has poison-generating flags,
3310 // drop them directly.
3311 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3312 VPValue *A, *B;
3313 // Dropping disjoint from an OR may yield incorrect results, as some
3314 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3315 // for dependence analysis). Instead, replace it with an equivalent Add.
3316 // This is possible as all users of the disjoint OR only access lanes
3317 // where the operands are disjoint or poison otherwise.
3318 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3319 RecWithFlags->isDisjoint()) {
3320 VPBuilder Builder(RecWithFlags);
3321 VPInstruction *New = Builder.createOverflowingOp(
3322 Instruction::Add, {A, B}, {false, false},
3323 RecWithFlags->getDebugLoc());
3324 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3325 RecWithFlags->replaceAllUsesWith(New);
3326 RecWithFlags->eraseFromParent();
3327 CurRec = New;
3328 } else
3329 RecWithFlags->dropPoisonGeneratingFlags();
3330 } else {
3331 Instruction *Instr = dyn_cast_or_null<Instruction>(
3332 CurRec->getVPSingleValue()->getUnderlyingValue());
3333 (void)Instr;
3334 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3335 "found instruction with poison generating flags not covered by "
3336 "VPRecipeWithIRFlags");
3337 }
3338
3339 // Add new definitions to the worklist.
3340 for (VPValue *Operand : CurRec->operands())
3341 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3342 Worklist.push_back(OpDef);
3343 }
3344 });
3345
3346 // Traverse all the recipes in the VPlan and collect the poison-generating
3347 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3348 // VPInterleaveRecipe.
3349 auto Iter = vp_depth_first_deep(Plan.getEntry());
3350 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3351 for (VPRecipeBase &Recipe : *VPBB) {
3352 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3353 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3354 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3355 if (AddrDef && WidenRec->isConsecutive() &&
3356 BlockNeedsPredication(UnderlyingInstr.getParent()))
3357 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3358 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3359 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3360 if (AddrDef) {
3361 // Check if any member of the interleave group needs predication.
3362 const InterleaveGroup<Instruction> *InterGroup =
3363 InterleaveRec->getInterleaveGroup();
3364 bool NeedPredication = false;
3365 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3366 I < NumMembers; ++I) {
3367 Instruction *Member = InterGroup->getMember(I);
3368 if (Member)
3369 NeedPredication |= BlockNeedsPredication(Member->getParent());
3370 }
3371
3372 if (NeedPredication)
3373 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3374 }
3375 }
3376 }
3377 }
3378}
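// Illustrative example (assumed IR): for a conditional but consecutive access
//   %gep = getelementptr inbounds float, ptr %A, i64 %iv
//   %l = load float, ptr %gep
// that becomes a wide memory recipe after vectorization, the widened address
// is also computed for masked-off lanes, so poison-generating flags such as
// 'inbounds' on recipes feeding the address are dropped above (and a disjoint
// 'or' is rewritten to an equivalent 'add').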
3379
3380 void VPlanTransforms::createInterleaveGroups(
3381 VPlan &Plan,
3382 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3383 &InterleaveGroups,
3384 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3385 if (InterleaveGroups.empty())
3386 return;
3387
3388 // Interleave memory: for each Interleave Group we marked earlier as relevant
3389 // for this VPlan, replace the Recipes widening its memory instructions with a
3390 // single VPInterleaveRecipe at its insertion point.
3391 VPDominatorTree VPDT(Plan);
3392 for (const auto *IG : InterleaveGroups) {
3393 auto *Start =
3394 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3395 VPIRMetadata InterleaveMD(*Start);
3396 SmallVector<VPValue *, 4> StoredValues;
3397 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3398 StoredValues.push_back(StoreR->getStoredValue());
3399 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3400 Instruction *MemberI = IG->getMember(I);
3401 if (!MemberI)
3402 continue;
3403 VPWidenMemoryRecipe *MemoryR =
3404 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3405 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3406 StoredValues.push_back(StoreR->getStoredValue());
3407 InterleaveMD.intersect(*MemoryR);
3408 }
3409
3410 bool NeedsMaskForGaps =
3411 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3412 (!StoredValues.empty() && !IG->isFull());
3413
3414 Instruction *IRInsertPos = IG->getInsertPos();
3415 auto *InsertPos =
3416 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3417
3419 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3420 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3421 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3422
3423 // Get or create the start address for the interleave group.
3424 VPValue *Addr = Start->getAddr();
3425 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3426 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3427 // We cannot re-use the address of member zero because it does not
3428 // dominate the insert position. Instead, use the address of the insert
3429 // position and create a PtrAdd adjusting it to the address of member
3430 // zero.
3431 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3432 // InsertPos or sink loads above zero members to join it.
3433 assert(IG->getIndex(IRInsertPos) != 0 &&
3434 "index of insert position shouldn't be zero");
3435 auto &DL = IRInsertPos->getDataLayout();
3436 APInt Offset(32,
3437 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3438 IG->getIndex(IRInsertPos),
3439 /*IsSigned=*/true);
3440 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3441 VPBuilder B(InsertPos);
3442 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3443 }
3444 // If the group is reverse, adjust the index to refer to the last vector
3445 // lane instead of the first. We adjust the index from the first vector
3446 // lane, rather than directly getting the pointer for lane VF - 1, because
3447 // the pointer operand of the interleaved access is supposed to be uniform.
3448 if (IG->isReverse()) {
3449 auto *ReversePtr = new VPVectorEndPointerRecipe(
3450 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3451 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3452 ReversePtr->insertBefore(InsertPos);
3453 Addr = ReversePtr;
3454 }
3455 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3456 InsertPos->getMask(), NeedsMaskForGaps,
3457 InterleaveMD, InsertPos->getDebugLoc());
3458 VPIG->insertBefore(InsertPos);
3459
3460 unsigned J = 0;
3461 for (unsigned i = 0; i < IG->getFactor(); ++i)
3462 if (Instruction *Member = IG->getMember(i)) {
3463 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3464 if (!Member->getType()->isVoidTy()) {
3465 VPValue *OriginalV = MemberR->getVPSingleValue();
3466 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3467 J++;
3468 }
3469 MemberR->eraseFromParent();
3470 }
3471 }
3472}
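// Illustrative example (assumed source): for a factor-2 group loading A[2*i]
// and A[2*i + 1], the two VPWidenLoadRecipes are replaced above by a single
// VPInterleaveRecipe at the group's insert position, whose per-member results
// replace the original recipes' uses; reversed groups additionally address
// the last vector lane via a VPVectorEndPointerRecipe.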
3473
3474/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3475/// value, phi and backedge value. In the following example:
3476///
3477/// vector.ph:
3478/// Successor(s): vector loop
3479///
3480/// <x1> vector loop: {
3481/// vector.body:
3482/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3483/// ...
3484/// EMIT branch-on-count ...
3485/// No successors
3486/// }
3487///
3488/// WIDEN-INDUCTION will get expanded to:
3489///
3490/// vector.ph:
3491/// ...
3492/// vp<%induction.start> = ...
3493/// vp<%induction.increment> = ...
3494///
3495/// Successor(s): vector loop
3496///
3497/// <x1> vector loop: {
3498/// vector.body:
3499/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3500/// ...
3501/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3502/// EMIT branch-on-count ...
3503/// No successors
3504/// }
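///
/// For a concrete (illustrative) instance with start 0, step 2 and VF 4, the
/// expansion yields vp<%induction.start> = <0, 2, 4, 6> and
/// vp<%induction.increment> = broadcast(step * VF), so every vector iteration
/// adds <8, 8, 8, 8> to the vector IV.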
3505static void
3506 expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3507 VPTypeAnalysis &TypeInfo) {
3508 VPlan *Plan = WidenIVR->getParent()->getPlan();
3509 VPValue *Start = WidenIVR->getStartValue();
3510 VPValue *Step = WidenIVR->getStepValue();
3511 VPValue *VF = WidenIVR->getVFValue();
3512 DebugLoc DL = WidenIVR->getDebugLoc();
3513
3514 // The value from the original loop to which we are mapping the new induction
3515 // variable.
3516 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3517
3518 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3519 Instruction::BinaryOps AddOp;
3520 Instruction::BinaryOps MulOp;
3521 VPIRFlags Flags = *WidenIVR;
3522 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3523 AddOp = Instruction::Add;
3524 MulOp = Instruction::Mul;
3525 } else {
3526 AddOp = ID.getInductionOpcode();
3527 MulOp = Instruction::FMul;
3528 }
3529
3530 // If the phi is truncated, truncate the start and step values.
3531 VPBuilder Builder(Plan->getVectorPreheader());
3532 Type *StepTy = TypeInfo.inferScalarType(Step);
3533 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3534 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3535 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3536 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3537 // Truncation doesn't preserve WrapFlags.
3538 Flags.dropPoisonGeneratingFlags();
3539 StepTy = Ty;
3540 }
3541
3542 // Construct the initial value of the vector IV in the vector loop preheader.
3543 Type *IVIntTy =
3544 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3545 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3546 if (StepTy->isFloatingPointTy())
3547 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3548
3549 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3550 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3551
3552 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3553 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3554 DebugLoc::getUnknown(), "induction");
3555
3556 // Create the widened phi of the vector IV.
3557 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3558 WidenIVR->getDebugLoc(), "vec.ind");
3559 WidePHI->insertBefore(WidenIVR);
3560
3561 // Create the backedge value for the vector IV.
3562 VPValue *Inc;
3563 VPValue *Prev;
3564 // If unrolled, use the increment and prev value from the operands.
3565 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3566 Inc = SplatVF;
3567 Prev = WidenIVR->getLastUnrolledPartOperand();
3568 } else {
3569 if (VPRecipeBase *R = VF->getDefiningRecipe())
3570 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3571 // Multiply the vectorization factor by the step using integer or
3572 // floating-point arithmetic as appropriate.
3573 if (StepTy->isFloatingPointTy())
3574 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3575 DL);
3576 else
3577 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3578 TypeInfo.inferScalarType(VF), DL);
3579
3580 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3581 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3582 Prev = WidePHI;
3583 }
3584
3586 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3587 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3588 WidenIVR->getDebugLoc(), "vec.ind.next");
3589
3590 WidePHI->addOperand(Next);
3591
3592 WidenIVR->replaceAllUsesWith(WidePHI);
3593}
3594
3595/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3596/// initial value, phi and backedge value. In the following example:
3597///
3598/// <x1> vector loop: {
3599/// vector.body:
3600/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3601/// ...
3602/// EMIT branch-on-count ...
3603/// }
3604///
3605/// WIDEN-POINTER-INDUCTION will get expanded to:
3606///
3607/// <x1> vector loop: {
3608/// vector.body:
3609/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3610/// EMIT %mul = mul %stepvector, %step
3611/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3612/// ...
3613/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3614/// EMIT branch-on-count ...
3615/// }
3616 static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3617 VPTypeAnalysis &TypeInfo) {
3618 VPlan *Plan = R->getParent()->getPlan();
3619 VPValue *Start = R->getStartValue();
3620 VPValue *Step = R->getStepValue();
3621 VPValue *VF = R->getVFValue();
3622
3623 assert(R->getInductionDescriptor().getKind() ==
3624 InductionDescriptor::IK_PtrInduction &&
3625 "Not a pointer induction according to InductionDescriptor!");
3626 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3627 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3628 "Recipe should have been replaced");
3629
3630 VPBuilder Builder(R);
3631 DebugLoc DL = R->getDebugLoc();
3632
3633 // Build a scalar pointer phi.
3634 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3635
3636 // Create actual address geps that use the pointer phi as base and a
3637 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3638 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3639 Type *StepTy = TypeInfo.inferScalarType(Step);
3640 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3641 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3642 VPValue *PtrAdd = Builder.createNaryOp(
3643 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3644 R->replaceAllUsesWith(PtrAdd);
3645
3646 // Create the backedge value for the scalar pointer phi.
3647 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3648 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3649 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3650 DL);
3651 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3652
3653 VPValue *InductionGEP =
3654 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3655 ScalarPtrPhi->addOperand(InductionGEP);
3656}
3657
3658 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3659 // Replace loop regions with an explicit CFG.
3660 SmallVector<VPRegionBlock *> LoopRegions;
3661 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3662 vp_depth_first_deep(Plan.getEntry()))) {
3663 if (!R->isReplicator())
3664 LoopRegions.push_back(R);
3665 }
3666 for (VPRegionBlock *R : LoopRegions)
3667 R->dissolveToCFGLoop();
3668}
3669
3670 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3671 VPTypeAnalysis TypeInfo(Plan);
3672 SmallVector<VPRecipeBase *> ToRemove;
3673 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3674 vp_depth_first_deep(Plan.getEntry()))) {
3675 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3676 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3677 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3678 ToRemove.push_back(WidenIVR);
3679 continue;
3680 }
3681
3682 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3683 // If the recipe only generates scalars, scalarize it instead of
3684 // expanding it.
3685 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3686 VPBuilder Builder(WidenIVR);
3687 VPValue *PtrAdd =
3688 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3689 WidenIVR->replaceAllUsesWith(PtrAdd);
3690 ToRemove.push_back(WidenIVR);
3691 continue;
3692 }
3693 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3694 ToRemove.push_back(WidenIVR);
3695 continue;
3696 }
3697
3698 // Expand VPBlendRecipe into VPInstruction::Select.
3699 VPBuilder Builder(&R);
3700 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3701 VPValue *Select = Blend->getIncomingValue(0);
3702 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3703 Select = Builder.createSelect(Blend->getMask(I),
3704 Blend->getIncomingValue(I), Select,
3705 R.getDebugLoc(), "predphi");
3706 Blend->replaceAllUsesWith(Select);
3707 ToRemove.push_back(Blend);
3708 }
3709
3710 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3711 Expr->decompose();
3712 ToRemove.push_back(Expr);
3713 }
3714
3715 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3716 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3717 if (LastActiveL &&
3718 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3719 // Create Not(Mask) for all operands.
3720 SmallVector<VPValue *> NotMasks;
3721 for (VPValue *Op : LastActiveL->operands()) {
3722 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3723 NotMasks.push_back(NotMask);
3724 }
3725
3726 // Create FirstActiveLane on the inverted masks.
3727 VPValue *FirstInactiveLane = Builder.createNaryOp(
3728 VPInstruction::FirstActiveLane, NotMasks,
3729 LastActiveL->getDebugLoc(), "first.inactive.lane");
3730
3731 // Subtract 1 to get the last active lane.
3732 VPValue *One = Plan.getOrAddLiveIn(
3733 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3734 VPValue *LastLane = Builder.createNaryOp(
3735 Instruction::Sub, {FirstInactiveLane, One},
3736 LastActiveL->getDebugLoc(), "last.active.lane");
3737
3738 LastActiveL->replaceAllUsesWith(LastLane);
3739 ToRemove.push_back(LastActiveL);
3740 continue;
3741 }
3742
3743 VPValue *VectorStep;
3744 VPValue *ScalarStep;
3745 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3746 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3747 continue;
3748
3749 // Expand WideIVStep.
3750 auto *VPI = cast<VPInstruction>(&R);
3751 Type *IVTy = TypeInfo.inferScalarType(VPI);
3752 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3753 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3754 ? Instruction::UIToFP
3755 : Instruction::Trunc;
3756 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3757 }
3758
3759 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3760 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3761 ScalarStep =
3762 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3763 }
3764
3765 VPIRFlags Flags;
3766 if (IVTy->isFloatingPointTy())
3767 Flags = {VPI->getFastMathFlags()};
3768
3769 unsigned MulOpc =
3770 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3771 VPInstruction *Mul = Builder.createNaryOp(
3772 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3773 VectorStep = Mul;
3774 VPI->replaceAllUsesWith(VectorStep);
3775 ToRemove.push_back(VPI);
3776 }
3777 }
3778
3779 for (VPRecipeBase *R : ToRemove)
3780 R->eraseFromParent();
3781}
3782
3783 void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3784 VPBasicBlock *EarlyExitVPBB,
3785 VPlan &Plan,
3786 VPBasicBlock *HeaderVPBB,
3787 VPBasicBlock *LatchVPBB) {
3788 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3789 if (!EarlyExitVPBB->getSinglePredecessor() &&
3790 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3791 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3792 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3793 "unsupported early exit VPBB");
3794 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3795 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3796 // of the phis.
3797 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3798 cast<VPIRPhi>(&R)->swapOperands();
3799 }
3800
3801 VPBuilder Builder(LatchVPBB->getTerminator());
3802 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3803 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3804 "Terminator must be be BranchOnCond");
3805 VPValue *CondOfEarlyExitingVPBB =
3806 EarlyExitingVPBB->getTerminator()->getOperand(0);
3807 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3808 ? CondOfEarlyExitingVPBB
3809 : Builder.createNot(CondOfEarlyExitingVPBB);
3810
3811 // Split the middle block and have it conditionally branch to the early exit
3812 // block if CondToEarlyExit.
3813 VPValue *IsEarlyExitTaken =
3814 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3815 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3816 VPBasicBlock *VectorEarlyExitVPBB =
3817 Plan.createVPBasicBlock("vector.early.exit");
3818 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3819 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3820 NewMiddle->swapSuccessors();
3821
3822 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3823
3824 // Update the exit phis in the early exit block.
3825 VPBuilder MiddleBuilder(NewMiddle);
3826 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3827 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3828 auto *ExitIRI = cast<VPIRPhi>(&R);
3829 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3830 // a single predecessor and 1 if it has two.
3831 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3832 if (ExitIRI->getNumOperands() != 1) {
3833 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3834 // predecessor. Extract its final lane.
3835 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3836 }
3837
3838 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3839 if (!IncomingFromEarlyExit->isLiveIn()) {
3840 // Update the incoming value from the early exit.
3841 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3842 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3843 DebugLoc::getUnknown(), "first.active.lane");
3844 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3845 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3846 DebugLoc::getUnknown(), "early.exit.value");
3847 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3848 }
3849 }
3850 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3851
3852 // Replace the condition controlling the non-early exit from the vector loop
3853 // with one exiting if either the original condition of the vector latch is
3854 // true or the early exit has been taken.
3855 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3856 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3857 "Unexpected terminator");
3858 auto *IsLatchExitTaken =
3859 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3860 LatchExitingBranch->getOperand(1));
3861 auto *AnyExitTaken = Builder.createNaryOp(
3862 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3863 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3864 LatchExitingBranch->eraseFromParent();
3865}
3866
3867/// This function tries to convert extended in-loop reductions to
3868/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3869/// valid. The created recipe must be decomposed to its constituent
3870/// recipes before execution.
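///
/// For example (illustrative), a loop accumulating
///   Sum += (int32_t)A[i]   // A is an int8_t array
/// produces reduce.add(sext(A)), which is bundled into a single extended
/// reduction here when the target reports a lower cost for it.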
3871static VPExpressionRecipe *
3872 tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3873 VFRange &Range) {
3874 Type *RedTy = Ctx.Types.inferScalarType(Red);
3875 VPValue *VecOp = Red->getVecOp();
3876
3877 // Clamp the range if using extended-reduction is profitable.
3878 auto IsExtendedRedValidAndClampRange =
3879 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3880 return LoopVectorizationPlanner::getDecisionAndClampRange(
3881 [&](ElementCount VF) {
3882 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3884
3885 InstructionCost ExtRedCost;
3886 InstructionCost ExtCost =
3887 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3888 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3889
3890 if (Red->isPartialReduction()) {
3891 TTI::PartialReductionExtendKind ExtKind =
3892 TTI::getPartialReductionExtendKind(ExtOpc);
3893 // FIXME: Move partial reduction creation, costing and clamping
3894 // here from LoopVectorize.cpp.
3895 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3896 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3897 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3898 } else {
3899 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3900 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3901 Red->getFastMathFlags(), CostKind);
3902 }
3903 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3904 },
3905 Range);
3906 };
3907
3908 VPValue *A;
3909 // Match reduce(ext)).
3910 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3911 IsExtendedRedValidAndClampRange(
3912 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3913 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3914 Ctx.Types.inferScalarType(A)))
3915 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3916
3917 return nullptr;
3918}
3919
3920/// This function tries to convert extended in-loop reductions to
3921/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3922/// and valid. The created VPExpressionRecipe must be decomposed to its
3923/// constituent recipes before execution. Patterns of the
3924/// VPExpressionRecipe:
3925/// reduce.add(mul(...)),
3926/// reduce.add(mul(ext(A), ext(B))),
3927/// reduce.add(ext(mul(ext(A), ext(B)))).
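///
/// For example (illustrative), a dot-product loop accumulating
///   Sum += (int32_t)A[i] * (int32_t)B[i]   // A, B are int8_t arrays
/// produces reduce.add(mul(sext(A), sext(B))), which is bundled into a
/// multiply-accumulate reduction here when the target reports a lower cost.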
3928static VPExpressionRecipe *
3929 tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3930 VPCostContext &Ctx, VFRange &Range) {
3931 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3932 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3933 return nullptr;
3934
3935 Type *RedTy = Ctx.Types.inferScalarType(Red);
3936
3937 // Clamp the range if using multiply-accumulate-reduction is profitable.
3938 auto IsMulAccValidAndClampRange =
3939 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
3940 VPWidenCastRecipe *OuterExt) -> bool {
3941 return LoopVectorizationPlanner::getDecisionAndClampRange(
3942 [&](ElementCount VF) {
3944 Type *SrcTy =
3945 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3946 InstructionCost MulAccCost;
3947
3948 if (Red->isPartialReduction()) {
3949 Type *SrcTy2 =
3950 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3951 // FIXME: Move partial reduction creation, costing and clamping
3952 // here from LoopVectorize.cpp.
3953 MulAccCost = Ctx.TTI.getPartialReductionCost(
3954 Opcode, SrcTy, SrcTy2, RedTy, VF,
3955 Ext0 ? TargetTransformInfo::getPartialReductionExtendKind(
3956 Ext0->getOpcode())
3957 : TargetTransformInfo::PR_None,
3958 Ext1 ? TargetTransformInfo::getPartialReductionExtendKind(
3959 Ext1->getOpcode())
3960 : TargetTransformInfo::PR_None,
3961 Mul->getOpcode(), CostKind);
3962 } else {
3963 // Only partial reductions support mixed extends at the moment.
3964 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3965 return false;
3966
3967 bool IsZExt =
3968 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3969 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3970 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3971 SrcVecTy, CostKind);
3972 }
3973
3974 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3975 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3976 InstructionCost ExtCost = 0;
3977 if (Ext0)
3978 ExtCost += Ext0->computeCost(VF, Ctx);
3979 if (Ext1)
3980 ExtCost += Ext1->computeCost(VF, Ctx);
3981 if (OuterExt)
3982 ExtCost += OuterExt->computeCost(VF, Ctx);
3983
3984 return MulAccCost.isValid() &&
3985 MulAccCost < ExtCost + MulCost + RedCost;
3986 },
3987 Range);
3988 };
3989
3990 VPValue *VecOp = Red->getVecOp();
3991 VPRecipeBase *Sub = nullptr;
3992 VPValue *A, *B;
3993 VPValue *Tmp = nullptr;
3994 // Sub reductions could have a sub between the add reduction and vec op.
3995 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3996 Sub = VecOp->getDefiningRecipe();
3997 VecOp = Tmp;
3998 }
3999
4000 // If ValB is a constant and can be safely extended, truncate it to the same
4001 // type as ExtA's operand, then extend it to the same type as ExtA. This
4002 // creates two uniform extends that can more easily be matched by the rest of
4003 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4004 // replaced with the new extend of the constant.
4005 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4006 VPWidenCastRecipe *&ExtB,
4007 VPValue *&ValB, VPWidenRecipe *Mul) {
4008 if (!ExtA || ExtB || !ValB->isLiveIn())
4009 return;
4010 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4011 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4012 const APInt *Const;
4013 if (!match(ValB, m_APInt(Const)) ||
4015 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4016 return;
4017 // The truncate ensures that the type of each extended operand is the
4018 // same, and it's been proven that the constant can be extended from
4019 // NarrowTy safely. Necessary since ExtA's extended operand would be
4020 // e.g. an i8, while the const will likely be an i32. This will be
4021 // elided by later optimisations.
4022 VPBuilder Builder(Mul);
4023 auto *Trunc =
4024 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4025 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4026 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4027 Mul->setOperand(1, ExtB);
4028 };
4029
4030 // Try to match reduce.add(mul(...)).
4031 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4032 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(A);
4033 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(B);
4034 auto *Mul = cast<VPWidenRecipe>(VecOp);
4035
4036 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4037 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4038
4039 // Match reduce.add/sub(mul(ext, ext)).
4040 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4041 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4042 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4043 if (Sub)
4044 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4045 cast<VPWidenRecipe>(Sub), Red);
4046 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4047 }
4048 // TODO: Add an expression type for this variant with a negated mul
4049 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4050 return new VPExpressionRecipe(Mul, Red);
4051 }
4052 // TODO: Add an expression type for negated versions of other expression
4053 // variants.
4054 if (Sub)
4055 return nullptr;
4056
4057 // Match reduce.add(ext(mul(A, B))).
4058 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4059 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4060 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4061 auto *Ext0 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(0));
4062 auto *Ext1 = dyn_cast<VPWidenCastRecipe>(Mul->getOperand(1));
4063
4064 // reduce.add(ext(mul(ext, const)))
4065 // -> reduce.add(ext(mul(ext, ext(const))))
4066 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4067
4068 // reduce.add(ext(mul(ext(A), ext(B))))
4069 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4070 // The inner extends must either have the same opcode as the outer extend or
4071 // be the same, in which case the multiply can never result in a negative
4072 // value and the outer extend can be folded away by doing wider
4073 // extends for the operands of the mul.
4074 if (Ext0 && Ext1 &&
4075 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4076 Ext0->getOpcode() == Ext1->getOpcode() &&
4077 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4078 auto *NewExt0 = new VPWidenCastRecipe(
4079 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4080 *Ext0, *Ext0, Ext0->getDebugLoc());
4081 NewExt0->insertBefore(Ext0);
4082
4083 VPWidenCastRecipe *NewExt1 = NewExt0;
4084 if (Ext0 != Ext1) {
4085 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4086 Ext->getResultType(), nullptr, *Ext1,
4087 *Ext1, Ext1->getDebugLoc());
4088 NewExt1->insertBefore(Ext1);
4089 }
4090 Mul->setOperand(0, NewExt0);
4091 Mul->setOperand(1, NewExt1);
4092 Red->setOperand(1, Mul);
4093 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4094 }
4095 }
4096 return nullptr;
4097}
4098
4099/// This function tries to create abstract recipes from the reduction recipe for
4100/// the following optimizations and cost estimation.
4101 static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
4102 VPCostContext &Ctx,
4103 VFRange &Range) {
4104 VPExpressionRecipe *AbstractR = nullptr;
4105 auto IP = std::next(Red->getIterator());
4106 auto *VPBB = Red->getParent();
4107 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4108 AbstractR = MulAcc;
4109 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4110 AbstractR = ExtRed;
4111 // Cannot create abstract inloop reduction recipes.
4112 if (!AbstractR)
4113 return;
4114
4115 AbstractR->insertBefore(*VPBB, IP);
4116 Red->replaceAllUsesWith(AbstractR);
4117}
4118
4129
4130 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4131 if (Plan.hasScalarVFOnly())
4132 return;
4133
4134#ifndef NDEBUG
4135 VPDominatorTree VPDT(Plan);
4136#endif
4137
4138 SmallVector<VPValue *> VPValues;
4141 append_range(VPValues, Plan.getLiveIns());
4142 for (VPRecipeBase &R : *Plan.getEntry())
4143 append_range(VPValues, R.definedValues());
4144
4145 auto *VectorPreheader = Plan.getVectorPreheader();
4146 for (VPValue *VPV : VPValues) {
4148 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
4149 isa<Constant>(VPV->getLiveInIRValue())))
4150 continue;
4151
4152 // Add explicit broadcast at the insert point that dominates all users.
4153 VPBasicBlock *HoistBlock = VectorPreheader;
4154 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4155 for (VPUser *User : VPV->users()) {
4156 if (User->usesScalars(VPV))
4157 continue;
4158 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4159 HoistPoint = HoistBlock->begin();
4160 else
4161 assert(VPDT.dominates(VectorPreheader,
4162 cast<VPRecipeBase>(User)->getParent()) &&
4163 "All users must be in the vector preheader or dominated by it");
4164 }
4165
4166 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4167 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4168 VPV->replaceUsesWithIf(Broadcast,
4169 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4170 return Broadcast != &U && !U.usesScalars(VPV);
4171 });
4172 }
4173}
4174
4175 void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
4176 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4177
4178 // Collect candidate loads with invariant addresses and noalias scopes
4179 // metadata and memory-writing recipes with noalias metadata.
4180 SmallVector<std::pair<VPReplicateRecipe *, MemoryLocation>> CandidateLoads;
4181 SmallVector<MemoryLocation> Stores;
4182 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4183 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4184 for (VPRecipeBase &R : *VPBB) {
4185 // Only handle single-scalar replicated loads with invariant addresses.
4186 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4187 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4188 RepR->getOpcode() != Instruction::Load)
4189 continue;
4190
4191 VPValue *Addr = RepR->getOperand(0);
4192 if (Addr->isDefinedOutsideLoopRegions()) {
4194 if (!Loc.AATags.Scope)
4195 continue;
4196 CandidateLoads.push_back({RepR, Loc});
4197 }
4198 }
4199 if (R.mayWriteToMemory()) {
4200 auto Loc = vputils::getMemoryLocation(R);
4201 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4202 return;
4203 Stores.push_back(*Loc);
4204 }
4205 }
4206 }
4207
4208 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4209 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4210 // Hoist the load to the preheader if it doesn't alias with any stores
4211 // according to the noalias metadata. Other loads should have been hoisted
4212 // by other passes.
4213 const AAMDNodes &LoadAA = LoadLoc.AATags;
4214 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4216 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4217 })) {
4218 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4219 }
4220 }
4221}
4222
4223// Collect common metadata from a group of replicate recipes by intersecting
4224// metadata from all recipes in the group.
4225 static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
4226 VPIRMetadata CommonMetadata = *Recipes.front();
4227 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4228 CommonMetadata.intersect(*Recipe);
4229 return CommonMetadata;
4230}
4231
4232template <unsigned Opcode>
4235 const Loop *L) {
4236 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4237 "Only Load and Store opcodes supported");
4238 constexpr bool IsLoad = (Opcode == Instruction::Load);
4239 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4240 VPTypeAnalysis TypeInfo(Plan);
4241
4242 // Group predicated operations by their address SCEV.
4244 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4245 auto *VPBB = cast<VPBasicBlock>(Block);
4246 for (VPRecipeBase &R : *VPBB) {
4247 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4248 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4249 continue;
4250
4251 // For loads, operand 0 is address; for stores, operand 1 is address.
4252 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4253 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
4254 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4255 RecipesByAddress[AddrSCEV].push_back(RepR);
4256 }
4257 }
4258
4259 // For each address, collect operations with the same or complementary masks.
4261 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4262 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4263 };
4264 for (auto &[Addr, Recipes] : RecipesByAddress) {
4265 if (Recipes.size() < 2)
4266 continue;
4267
4268 // Collect groups with the same or complementary masks.
4269 for (VPReplicateRecipe *&RecipeI : Recipes) {
4270 if (!RecipeI)
4271 continue;
4272
4273 VPValue *MaskI = RecipeI->getMask();
4274 Type *TypeI = GetLoadStoreValueType(RecipeI);
4275 SmallVector<VPReplicateRecipe *> Group;
4276 Group.push_back(RecipeI);
4277 RecipeI = nullptr;
4278
4279 // Find all operations with the same or complementary masks.
4280 bool HasComplementaryMask = false;
4281 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4282 if (!RecipeJ)
4283 continue;
4284
4285 VPValue *MaskJ = RecipeJ->getMask();
4286 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4287 if (TypeI == TypeJ) {
4288 // Check if any operation in the group has a complementary mask with
4289 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4290 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4291 match(MaskJ, m_Not(m_Specific(MaskI)));
4292 Group.push_back(RecipeJ);
4293 RecipeJ = nullptr;
4294 }
4295 }
4296
4297 if (HasComplementaryMask) {
4298 assert(Group.size() >= 2 && "must have at least 2 entries");
4299 AllGroups.push_back(std::move(Group));
4300 }
4301 }
4302 }
4303
4304 return AllGroups;
4305}
4306
4307// Find the recipe with minimum alignment in the group.
4308template <typename InstType>
4309static VPReplicateRecipe *
4310 findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
4311 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4312 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4313 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4314 });
4315}
4316
4317 void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
4318 const Loop *L) {
4319 auto Groups =
4321 if (Groups.empty())
4322 return;
4323
4324 VPDominatorTree VPDT(Plan);
4325
4326 // Process each group of loads.
4327 for (auto &Group : Groups) {
4328 // Sort loads by dominance order, with earliest (most dominating) first.
4329 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4330 return VPDT.properlyDominates(A, B);
4331 });
4332
4333 // Try to use the earliest (most dominating) load to replace all others.
4334 VPReplicateRecipe *EarliestLoad = Group[0];
4335 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4336 VPBasicBlock *LastBB = Group.back()->getParent();
4337
4338 // Check that the load doesn't alias with stores between first and last.
4339 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4340 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4341 continue;
4342
4343 // Collect common metadata from all loads in the group.
4344 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4345
4346 // Find the load with minimum alignment to use.
4347 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4348
4349 // Create an unpredicated version of the earliest load with common
4350 // metadata.
4351 auto *UnpredicatedLoad = new VPReplicateRecipe(
4352 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4353 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4354 CommonMetadata);
4355
4356 UnpredicatedLoad->insertBefore(EarliestLoad);
4357
4358 // Replace all loads in the group with the unpredicated load.
4359 for (VPReplicateRecipe *Load : Group) {
4360 Load->replaceAllUsesWith(UnpredicatedLoad);
4361 Load->eraseFromParent();
4362 }
4363 }
4364}
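// Illustrative example (assumed source): for
//   if (c) x = *p; else y = *p;
// the two predicated loads of *p have complementary masks (m and not m), so
// they are replaced above by one unpredicated load of *p that both uses share.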
4365
4366static bool
4368 ScalarEvolution &SE, const Loop &L,
4369 VPTypeAnalysis &TypeInfo) {
4370 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4371 if (!StoreLoc || !StoreLoc->AATags.Scope)
4372 return false;
4373
4374 // When sinking a group of stores, all members of the group alias each other.
4375 // Skip them during the alias checks.
4376 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4377 StoresToSink.end());
4378
4379 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4380 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4381 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], SE, L, TypeInfo);
4382 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4383}
4384
4385 void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
4386 const Loop *L) {
4387 auto Groups =
4389 if (Groups.empty())
4390 return;
4391
4392 VPDominatorTree VPDT(Plan);
4393 VPTypeAnalysis TypeInfo(Plan);
4394
4395 for (auto &Group : Groups) {
4396 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4397 return VPDT.properlyDominates(A, B);
4398 });
4399
4400 if (!canSinkStoreWithNoAliasCheck(Group, SE, *L, TypeInfo))
4401 continue;
4402
4403 // Use the last (most dominated) store's location for the unconditional
4404 // store.
4405 VPReplicateRecipe *LastStore = Group.back();
4406 VPBasicBlock *InsertBB = LastStore->getParent();
4407
4408 // Collect common alias metadata from all stores in the group.
4409 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4410
4411 // Build select chain for stored values.
4412 VPValue *SelectedValue = Group[0]->getOperand(0);
4413 VPBuilder Builder(InsertBB, LastStore->getIterator());
4414
4415 for (unsigned I = 1; I < Group.size(); ++I) {
4416 VPValue *Mask = Group[I]->getMask();
4417 VPValue *Value = Group[I]->getOperand(0);
4418 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4419 Group[I]->getDebugLoc());
4420 }
4421
4422 // Find the store with minimum alignment to use.
4423 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4424
4425 // Create unconditional store with selected value and common metadata.
4426 auto *UnpredicatedStore =
4427 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4428 {SelectedValue, LastStore->getOperand(1)},
4429 /*IsSingleScalar=*/false,
4430 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4431 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4432
4433 // Remove all predicated stores from the group.
4434 for (VPReplicateRecipe *Store : Group)
4435 Store->eraseFromParent();
4436 }
4437}
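// Illustrative example (assumed source): for
//   if (c) *p = a; else *p = b;
// the two predicated stores to *p are replaced above by a single unconditional
// store of a select between a and b, emitted at the position of the last
// store in the group.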
4438
4439 void VPlanTransforms::materializeConstantVectorTripCount(
4440 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4441 PredicatedScalarEvolution &PSE) {
4442 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4443 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4444
4445 VPValue *TC = Plan.getTripCount();
4446 // Skip cases for which the trip count may be non-trivial to materialize.
4447 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4448 // tail is required.
4449 if (!Plan.hasScalarTail() ||
4450 Plan.getMiddleBlock()->getSingleSuccessor() ==
4451 Plan.getScalarPreheader() ||
4452 !TC->isLiveIn())
4453 return;
4454
4455 // Materialize vector trip counts for constants early if it can simply
4456 // be computed as (Original TC / VF * UF) * VF * UF.
4457 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4458 // tail-folded loops.
4459 ScalarEvolution &SE = *PSE.getSE();
4460 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4461 if (!isa<SCEVConstant>(TCScev))
4462 return;
4463 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4464 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4465 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4466 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4467}
4468
4469 void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
4470 VPBasicBlock *VectorPH) {
4471 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
4472 if (BTC->getNumUsers() == 0)
4473 return;
4474
4475 VPBuilder Builder(VectorPH, VectorPH->begin());
4476 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4477 auto *TCMO = Builder.createNaryOp(
4478 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4479 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4480 BTC->replaceAllUsesWith(TCMO);
4481}
4482
4484 if (Plan.hasScalarVFOnly())
4485 return;
4486
4487 VPTypeAnalysis TypeInfo(Plan);
4488 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4489 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4490 vp_depth_first_shallow(Plan.getEntry()));
4491 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4492 vp_depth_first_shallow(LoopRegion->getEntry()));
4493 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4494 // VPInstructions, excluding ones in replicate regions. Those are not
4495 // materialized explicitly yet. Those vector users are still handled in
4496 // VPReplicateRegion::execute(), via shouldPack().
4497 // TODO: materialize build vectors for replicating recipes in replicating
4498 // regions.
4499 for (VPBasicBlock *VPBB :
4500 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4501 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4503 continue;
4504 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4505 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4506 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4507 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4508 };
4509 if ((isa<VPReplicateRecipe>(DefR) &&
4510 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4511 (isa<VPInstruction>(DefR) &&
4513 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4514 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4515 continue;
4516
4517 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4518 unsigned Opcode = ScalarTy->isStructTy()
4519 ? VPInstruction::BuildStructVector
4520 : VPInstruction::BuildVector;
4521 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4522 BuildVector->insertAfter(DefR);
4523
4524 DefR->replaceUsesWithIf(
4525 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4526 VPUser &U, unsigned) {
4527 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4528 });
4529 }
4530 }
4531
4532 // Create explicit VPInstructions to convert vectors to scalars. The current
4533 // implementation is conservative - it may miss some cases that may or may not
4534 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4535 // if they are known to operate on scalar values.
4536 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4537 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4540 continue;
4541 for (VPValue *Def : R.definedValues()) {
4542 // Skip recipes that are single-scalar or only have their first lane
4543 // used.
4544 // TODO: The Defs skipped here may or may not be vector values.
4545 // Introduce Unpacks, and remove them later, if they are guaranteed to
4546 // produce scalar values.
4548 continue;
4549
4550 // At the moment, we create unpacks only for scalar users outside
4551 // replicate regions. Recipes inside replicate regions still extract the
4552 // required lanes implicitly.
4553 // TODO: Remove once replicate regions are unrolled completely.
4554 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4555 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4556 return U->usesScalars(Def) &&
4557 (!ParentRegion || !ParentRegion->isReplicator());
4558 };
4559 if (none_of(Def->users(), IsCandidateUnpackUser))
4560 continue;
4561
4562 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4563 if (R.isPhi())
4564 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4565 else
4566 Unpack->insertAfter(&R);
4567 Def->replaceUsesWithIf(Unpack,
4568 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4569 return IsCandidateUnpackUser(&U);
4570 });
4571 }
4572 }
4573 }
4574}
4575
4577 VPBasicBlock *VectorPHVPBB,
4578 bool TailByMasking,
4579 bool RequiresScalarEpilogue) {
4580 VPValue &VectorTC = Plan.getVectorTripCount();
4581 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4582 // There's nothing to do if there are no users of the vector trip count or its
4583 // IR value has already been set.
4584 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4585 return;
4586
4587 VPValue *TC = Plan.getTripCount();
4588 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4589 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4590 VPValue *Step = &Plan.getVFxUF();
4591
4592 // If the tail is to be folded by masking, round the number of iterations N
4593 // up to a multiple of Step instead of rounding down. This is done by first
4594 // adding Step-1 and then rounding down. Note that it's ok if this addition
4595 // overflows: the vector induction variable will eventually wrap to zero given
4596 // that it starts at zero and its Step is a power of two; the loop will then
4597 // exit, with the last early-exit vector comparison also producing all-true.
4598 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4599 // is accounted for in emitIterationCountCheck that adds an overflow check.
4600 if (TailByMasking) {
4601 TC = Builder.createNaryOp(
4602 Instruction::Add,
4603 {TC, Builder.createNaryOp(Instruction::Sub,
4604 {Step, Plan.getConstantInt(TCTy, 1)})},
4605 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4606 }
4607
4608 // Now we need to generate the expression for the part of the loop that the
4609 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4610 // iterations are not required for correctness, or N - Step, otherwise. Step
4611 // is equal to the vectorization factor (number of SIMD elements) times the
4612 // unroll factor (number of SIMD instructions).
4613 VPValue *R =
4614 Builder.createNaryOp(Instruction::URem, {TC, Step},
4615 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4616
4617 // There are cases where we *must* run at least one iteration in the remainder
4618 // loop. See the cost model for when this can happen. If the step evenly
4619 // divides the trip count, we set the remainder to be equal to the step. If
4620 // the step does not evenly divide the trip count, no adjustment is necessary
4621 // since there will already be scalar iterations. Note that the minimum
4622 // iterations check ensures that N >= Step.
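// For example (illustrative): with Step = 8 and N = 17, R = 1 and n.vec = 16;
// with N = 16 and RequiresScalarEpilogue, R is bumped from 0 to 8 below, so
// n.vec = 8 and the epilogue loop runs the remaining 8 iterations.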
4623 if (RequiresScalarEpilogue) {
4624 assert(!TailByMasking &&
4625 "requiring scalar epilogue is not supported with fail folding");
4626 VPValue *IsZero =
4627 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4628 R = Builder.createSelect(IsZero, Step, R);
4629 }
4630
4631 VPValue *Res = Builder.createNaryOp(
4632 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4633 VectorTC.replaceAllUsesWith(Res);
4634}
4635
4637 ElementCount VFEC) {
4638 VPBuilder Builder(VectorPH, VectorPH->begin());
4639 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4640 VPValue &VF = Plan.getVF();
4641 VPValue &VFxUF = Plan.getVFxUF();
4642 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4643 // used.
4644 // TODO: Assert that they aren't used.
4645
4646 // If there are no users of the runtime VF, compute VFxUF by constant folding
4647 // the multiplication of VF and UF.
4648 if (VF.getNumUsers() == 0) {
4649 VPValue *RuntimeVFxUF =
4650 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4651 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4652 return;
4653 }
4654
4655 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4656 // vscale) * UF.
4657 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4659 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4660 VF.replaceUsesWithIf(
4661 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4662 }
4663 VF.replaceAllUsesWith(RuntimeVF);
4664
4665 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4666 VPValue *MulByUF = Builder.createOverflowingOp(
4667 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4668 VFxUF.replaceAllUsesWith(MulByUF);
4669}
4670
4673 const DataLayout &DL = SE.getDataLayout();
4674 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4675
4676 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4677 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4678 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4679 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4680 if (isa<VPIRInstruction>(&R))
4681 continue;
4682 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4683 if (!ExpSCEV)
4684 break;
4685 const SCEV *Expr = ExpSCEV->getSCEV();
4686 Value *Res =
4687 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4688 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4689 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4690 ExpSCEV->replaceAllUsesWith(Exp);
4691 if (Plan.getTripCount() == ExpSCEV)
4692 Plan.resetTripCount(Exp);
4693 ExpSCEV->eraseFromParent();
4694 }
4696 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4697 "after any VPIRInstructions");
4698 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4699 // to the VPIRBasicBlock.
4700 auto EI = Entry->begin();
4701 for (Instruction &I : drop_end(*EntryBB)) {
4702 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4703 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4704 EI++;
4705 continue;
4706 }
4708 }
4709
4710 return ExpandedSCEVs;
4711}
4712
4713/// Returns true if \p OpV is defined by a VPWidenLoadRecipe or VPInterleaveRecipe
4714/// that can be converted to a narrower recipe. \p OpV is the operand at index
4715/// \p OpIdx of a wide recipe that feeds a store interleave group at index \p Idx,
4716/// and \p WideMember0 is the recipe feeding the same interleave group at index 0.
4717/// A VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
4718/// all wide ops at all indices, i.e. \p OpV must also be the operand at index
4719/// \p OpIdx of \p WideMember0. A VPInterleaveRecipe can be narrowed to a wide
4720/// load if \p OpV is defined at index \p Idx of a load interleave group.
4721static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4722 VPValue *OpV, unsigned Idx) {
4723 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4724 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4725 if (!Member0OpR)
4726 return Member0Op == OpV;
4727 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4728 return !W->getMask() && Member0Op == OpV;
4729 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4730 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4731 return false;
4732}
4733
4734/// Returns true if \p InterleaveR is a full interleave group with factor and
4735/// number of members both equal to \p VF. The interleave group must also access
4736/// the full vector width \p VectorRegWidth.
4737 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4738 ElementCount VF,
4739 VPTypeAnalysis &TypeInfo,
4740 TypeSize VectorRegWidth) {
4741 if (!InterleaveR || InterleaveR->getMask())
4742 return false;
4743
4744 Type *GroupElementTy = nullptr;
4745 if (InterleaveR->getStoredValues().empty()) {
4746 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4747 if (!all_of(InterleaveR->definedValues(),
4748 [&TypeInfo, GroupElementTy](VPValue *Op) {
4749 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4750 }))
4751 return false;
4752 } else {
4753 GroupElementTy =
4754 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4755 if (!all_of(InterleaveR->getStoredValues(),
4756 [&TypeInfo, GroupElementTy](VPValue *Op) {
4757 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4758 }))
4759 return false;
4760 }
4761
4762 unsigned VFMin = VF.getKnownMinValue();
4763 TypeSize GroupSize = TypeSize::get(
4764 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4765 const auto *IG = InterleaveR->getInterleaveGroup();
4766 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4767 GroupSize == VectorRegWidth;
4768}
4769
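Illustration (not part of this file): a standalone sketch of the "consecutive, full-width" condition checked by isConsecutiveInterleaveGroup, using plain integers instead of VPlan types. The helper name and values are assumptions for the example.

#include <cstdint>
#include <iostream>

// A group qualifies when its factor and member count both equal the known
// minimum VF and the group exactly fills one vector register.
static bool isConsecutiveFullWidthGroup(uint64_t VFMin, uint64_t Factor,
                                        uint64_t NumMembers,
                                        uint64_t ElementBits,
                                        uint64_t VectorRegBits) {
  return Factor == VFMin && NumMembers == VFMin &&
         ElementBits * VFMin == VectorRegBits;
}

int main() {
  // VF = 4 x i32 against a 128-bit register: a factor-4 group with 4 members
  // spans 4 * 32 = 128 bits and qualifies; a factor-2 group does not.
  std::cout << isConsecutiveFullWidthGroup(4, 4, 4, 32, 128) << '\n'; // 1
  std::cout << isConsecutiveFullWidthGroup(4, 2, 2, 32, 128) << '\n'; // 0
}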
4770/// Returns true if \p VPValue is a narrow VPValue.
4771static bool isAlreadyNarrow(VPValue *VPV) {
4772 if (VPV->isLiveIn())
4773 return true;
4774 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4775 return RepR && RepR->isSingleScalar();
4776}
4777
4778// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4779// a narrow variant.
4780static VPValue *
4781 narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4782 auto *R = V->getDefiningRecipe();
4783 if (!R || NarrowedOps.contains(V))
4784 return V;
4785
4786 if (isAlreadyNarrow(V))
4787 return V;
4788
4789 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4790 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4791 WideMember0->setOperand(
4792 Idx,
4793 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4794 return V;
4795 }
4796
4797 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4798 // Narrow interleave group to wide load, as transformed VPlan will only
4799 // process one original iteration.
4800 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4801 auto *L = new VPWidenLoadRecipe(
4802 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4803 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4804 L->insertBefore(LoadGroup);
4805 NarrowedOps.insert(L);
4806 return L;
4807 }
4808
4809 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4810 assert(RepR->isSingleScalar() &&
4811 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4812 "must be a single scalar load");
4813 NarrowedOps.insert(RepR);
4814 return RepR;
4815 }
4816
4817 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4818 VPValue *PtrOp = WideLoad->getAddr();
4819 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4820 PtrOp = VecPtr->getOperand(0);
4821 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4822 // process one original iteration.
4823 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4824 /*IsUniform*/ true,
4825 /*Mask*/ nullptr, {}, *WideLoad);
4826 N->insertBefore(WideLoad);
4827 NarrowedOps.insert(N);
4828 return N;
4829}
4830
4831 void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4832 TypeSize VectorRegWidth) {
4833 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4834 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4835 return;
4836
4837 VPTypeAnalysis TypeInfo(Plan);
4838
4839 SmallVector<VPInterleaveRecipe *> StoreGroups;
4840 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4842 continue;
4843
4846 continue;
4847
4848 // Bail out on recipes not supported at the moment:
4849 // * phi recipes other than the canonical induction
4850 // * recipes writing to memory except interleave groups
4851 // Only support plans with a canonical induction phi.
4852 if (R.isPhi())
4853 return;
4854
4855 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4856 if (R.mayWriteToMemory() && !InterleaveR)
4857 return;
4858
4859 // Do not narrow interleave groups if there are VectorPointer recipes and
4860 // the plan was unrolled. The recipe implicitly uses VF from
4861 // VPTransformState.
4862 // TODO: Remove restriction once the VF for the VectorPointer offset is
4863 // modeled explicitly as operand.
4864 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4865 return;
4866
4867 // All other ops are allowed, but we reject uses that cannot be converted
4868 // when checking all allowed consumers (store interleave groups) below.
4869 if (!InterleaveR)
4870 continue;
4871
4872 // Bail out on non-consecutive interleave groups.
4873 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4874 VectorRegWidth))
4875 return;
4876
4877 // Skip read interleave groups.
4878 if (InterleaveR->getStoredValues().empty())
4879 continue;
4880
4881 // Narrow interleave groups, if all operands are already matching narrow
4882 // ops.
4883 auto *Member0 = InterleaveR->getStoredValues()[0];
4884 if (isAlreadyNarrow(Member0) &&
4885 all_of(InterleaveR->getStoredValues(),
4886 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4887 StoreGroups.push_back(InterleaveR);
4888 continue;
4889 }
4890
4891 // For now, we only support full interleave groups storing load interleave
4892 // groups.
4893 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4894 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4895 if (!DefR)
4896 return false;
4897 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4898 return IR && IR->getInterleaveGroup()->isFull() &&
4899 IR->getVPValue(Op.index()) == Op.value();
4900 })) {
4901 StoreGroups.push_back(InterleaveR);
4902 continue;
4903 }
4904
4905 // Check if all values feeding InterleaveR are matching wide recipes whose
4906 // operands can be narrowed.
4907 auto *WideMember0 =
4908 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4909 if (!WideMember0)
4910 return;
4911 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4912 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4913 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4914 R->getNumOperands() > 2)
4915 return;
4916 if (any_of(enumerate(R->operands()),
4917 [WideMember0, Idx = I](const auto &P) {
4918 const auto &[OpIdx, OpV] = P;
4919 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4920 }))
4921 return;
4922 }
4923 StoreGroups.push_back(InterleaveR);
4924 }
4925
4926 if (StoreGroups.empty())
4927 return;
4928
4929 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4930 SmallPtrSet<VPValue *, 4> NarrowedOps;
4931 // Narrow operation tree rooted at store groups.
4932 for (auto *StoreGroup : StoreGroups) {
4933 VPValue *Res =
4934 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4935 auto *SI =
4936 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4937 auto *S = new VPWidenStoreRecipe(
4938 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4939 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4940 S->insertBefore(StoreGroup);
4941 StoreGroup->eraseFromParent();
4942 }
4943
4944 // Adjust induction to reflect that the transformed plan only processes one
4945 // original iteration.
4946 auto *CanIV = VectorLoop->getCanonicalIV();
4947 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4948 VPBuilder PHBuilder(Plan.getVectorPreheader());
4949
4950 VPValue *UF = Plan.getOrAddLiveIn(
4951 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4952 if (VF.isScalable()) {
4953 VPValue *VScale = PHBuilder.createElementCount(
4955 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
4956 Instruction::Mul, {VScale, UF}, {true, false});
4957 Inc->setOperand(1, VScaleUF);
4958 Plan.getVF().replaceAllUsesWith(VScale);
4959 } else {
4960 Inc->setOperand(1, UF);
4961 Plan.getVF().replaceAllUsesWith(
4962 Plan.getConstantInt(CanIV->getScalarType(), 1));
4963 }
4964 removeDeadRecipes(Plan);
4965}
4966
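Illustration (not part of this file): the observable effect of the narrowing above on a simple source loop. When every member of a store interleave group is fed 1:1 by the members of a full load group (or by a single uniform value), the interleaved accesses are really one consecutive block, so the narrowed plan uses a single wide load and store and advances the induction by one original iteration per vector iteration (hence the UF, or vscale * UF, increment installed above). The loop bodies below are invented for the example; both compute the same result.

#include <algorithm>
#include <array>
#include <cassert>

int main() {
  constexpr int N = 8;
  std::array<int, 2 * N> Src{}, DstInterleaved{}, DstNarrowed{};
  for (int I = 0; I < 2 * N; ++I)
    Src[I] = 3 * I + 1;

  // Interleaved view: factor-2 load and store groups, one member per field.
  for (int I = 0; I < N; ++I) {
    DstInterleaved[2 * I] = Src[2 * I] + 1;
    DstInterleaved[2 * I + 1] = Src[2 * I + 1] + 1;
  }

  // Narrowed view: the pair is one contiguous 2-element block, so a single
  // wide load and a single wide store per original iteration suffice.
  for (int I = 0; I < N; ++I) {
    std::array<int, 2> Wide; // "wide load" of both members
    std::copy_n(&Src[2 * I], 2, Wide.begin());
    for (int &V : Wide)
      V += 1;
    std::copy_n(Wide.begin(), 2, &DstNarrowed[2 * I]); // "wide store"
  }

  assert(DstInterleaved == DstNarrowed);
}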
4967/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4968/// BranchOnCond recipe.
4969 void VPlanTransforms::addBranchWeightToMiddleTerminator(
4970 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4971 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4972 auto *MiddleTerm =
4973 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4974 // Only add branch metadata if there is a (conditional) terminator.
4975 if (!MiddleTerm)
4976 return;
4977
4978 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4979 "must have a BranchOnCond");
4980 // Assume that `TripCount % VectorStep` is uniformly distributed.
4981 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4982 if (VF.isScalable() && VScaleForTuning.has_value())
4983 VectorStep *= *VScaleForTuning;
4984 assert(VectorStep > 0 && "trip count should not be zero");
4985 MDBuilder MDB(Plan.getContext());
4986 MDNode *BranchWeights =
4987 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4988 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
4989}
4990
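Illustration (not part of this file): a sketch of the branch-weight computation above. Assuming TripCount % VectorStep is uniformly distributed, the middle block's branch bypasses the scalar remainder loop with probability roughly 1 / VectorStep, which is encoded as weights {1, VectorStep - 1}. The helper name is made up for the example.

#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>

static std::pair<uint32_t, uint32_t>
middleBlockBranchWeights(uint64_t UF, uint64_t KnownMinVF, bool ScalableVF,
                         std::optional<unsigned> VScaleForTuning) {
  uint64_t VectorStep = UF * KnownMinVF;
  if (ScalableVF && VScaleForTuning)
    VectorStep *= *VScaleForTuning;
  return {1, static_cast<uint32_t>(VectorStep - 1)};
}

int main() {
  auto [Taken, NotTaken] = middleBlockBranchWeights(2, 4, false, std::nullopt);
  std::cout << Taken << " : " << NotTaken << '\n'; // 1 : 7 for VF = 4, UF = 2
}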
4991/// Compute and return the end value for \p WideIV, unless it is truncated. If
4992/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
4993/// compute the end value of the induction.
4994 static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
4995 VPBuilder &VectorPHBuilder,
4996 VPTypeAnalysis &TypeInfo,
4997 VPValue *VectorTC) {
4998 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4999 // Truncated wide inductions resume from the last lane of their vector value
5000 // in the last vector iteration which is handled elsewhere.
5001 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5002 return nullptr;
5003
5004 VPValue *Start = WideIV->getStartValue();
5005 VPValue *Step = WideIV->getStepValue();
5006 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
5007 VPValue *EndValue = VectorTC;
5008 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5009 EndValue = VectorPHBuilder.createDerivedIV(
5010 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5011 Start, VectorTC, Step);
5012 }
5013
5014 // EndValue is derived from the vector trip count (which has the same type as
5015 // the widest induction) and thus may be wider than the induction here.
5016 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5017 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5018 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5019 ScalarTypeOfWideIV,
5020 WideIV->getDebugLoc());
5021 }
5022
5023 return EndValue;
5024}
5025
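Illustration (not part of this file): how the end value computed above feeds the scalar resume phi for an untruncated integer induction. For a canonical IV (start 0, step 1) the end value is the vector trip count itself; otherwise it is derived as Start + VectorTC * Step, which is what the derived-IV recipe models. The helper and values are invented for the example.

#include <cstdint>
#include <iostream>

static int64_t inductionEndValue(int64_t Start, int64_t Step,
                                 uint64_t VectorTC) {
  return Start + static_cast<int64_t>(VectorTC) * Step;
}

int main() {
  // Induction i starts at 100 and steps by -3; the vector loop executes 16
  // original iterations, so the scalar epilogue resumes at i == 52.
  std::cout << inductionEndValue(100, -3, 16) << '\n'; // 52
  // Canonical IV (start 0, step 1): the end value equals the vector trip count.
  std::cout << inductionEndValue(0, 1, 16) << '\n';    // 16
}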
5027 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5028 VPTypeAnalysis TypeInfo(Plan);
5029 auto *ScalarPH = Plan.getScalarPreheader();
5030 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5031 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5032 VPBuilder VectorPHBuilder(
5033 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5034 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5035 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5036 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5037
5038 // TODO: Extract final value from induction recipe initially, optimize to
5039 // pre-computed end value together in optimizeInductionExitUsers.
5040 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5041 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5042 if (VPValue *EndValue = tryToComputeEndValueForInduction(
5043 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5044 IVEndValues[WideIVR] = EndValue;
5045 ResumePhiR->setOperand(0, EndValue);
5046 ResumePhiR->setName("bc.resume.val");
5047 continue;
5048 }
5049 // TODO: Also handle truncated inductions here. Computing end-values
5050 // separately should be done as VPlan-to-VPlan optimization, after
5051 // legalizing all resume values to use the last lane from the loop.
5052 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5053 "should only skip truncated wide inductions");
5054 continue;
5055 }
5056
5057 // The backedge value provides the value to resume coming out of a loop,
5058 // which for FORs is a vector whose last element needs to be extracted. The
5059 // start value provides the value if the loop is bypassed.
5060 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5061 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5062 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5063 "Cannot handle loops with uncountable early exits");
5064 if (IsFOR) {
5065 auto *ExtractPart = MiddleBuilder.createNaryOp(
5066 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5067 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5069 "vector.recur.extract");
5070 }
5071 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5072 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5073 }
5074}
5075
5077 VFRange &Range) {
5078 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5079 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5080 auto *MiddleVPBB = Plan.getMiddleBlock();
5081 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5082 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5083
5084 auto IsScalableOne = [](ElementCount VF) -> bool {
5085 return VF == ElementCount::getScalable(1);
5086 };
5087
5088 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5089 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5090 if (!FOR)
5091 continue;
5092
5093 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5094 "Cannot handle loops with uncountable early exits");
5095
5096 // This is the second phase of vectorizing first-order recurrences, creating
5097 // extract for users outside the loop. An overview of the transformation is
5098 // described below. Suppose we have the following loop with some use after
5099 // the loop of the last a[i-1],
5100 //
5101 // for (int i = 0; i < n; ++i) {
5102 // t = a[i - 1];
5103 // b[i] = a[i] - t;
5104 // }
5105 // use t;
5106 //
5107 // There is a first-order recurrence on "a". For this loop, the shorthand
5108 // scalar IR looks like:
5109 //
5110 // scalar.ph:
5111 // s.init = a[-1]
5112 // br scalar.body
5113 //
5114 // scalar.body:
5115 // i = phi [0, scalar.ph], [i+1, scalar.body]
5116 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5117 // s2 = a[i]
5118 // b[i] = s2 - s1
5119 // br cond, scalar.body, exit.block
5120 //
5121 // exit.block:
5122 // use = lcssa.phi [s1, scalar.body]
5123 //
5124 // In this example, s1 is a recurrence because its value depends on the
5125 // previous iteration. In the first phase of vectorization, we created a
5126 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5127 // for users in the scalar preheader and exit block.
5128 //
5129 // vector.ph:
5130 // v_init = vector(..., ..., ..., a[-1])
5131 // br vector.body
5132 //
5133 // vector.body
5134 // i = phi [0, vector.ph], [i+4, vector.body]
5135 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5136 // v2 = a[i, i+1, i+2, i+3]
5137 // b[i] = v2 - v1
5138 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5139 // b[i, i+1, i+2, i+3] = v2 - v1
5140 // br cond, vector.body, middle.block
5141 //
5142 // middle.block:
5143 // vector.recur.extract.for.phi = v2(2)
5144 // vector.recur.extract = v2(3)
5145 // br cond, scalar.ph, exit.block
5146 //
5147 // scalar.ph:
5148 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5149 // [s.init, otherwise]
5150 // br scalar.body
5151 //
5152 // scalar.body:
5153 // i = phi [0, scalar.ph], [i+1, scalar.body]
5154 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5155 // s2 = a[i]
5156 // b[i] = s2 - s1
5157 // br cond, scalar.body, exit.block
5158 //
5159 // exit.block:
5160 // lo = lcssa.phi [s1, scalar.body],
5161 // [vector.recur.extract.for.phi, middle.block]
5162 //
5163 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5164 // Extract the penultimate value of the recurrence and use it as operand for
5165 // the VPIRInstruction modeling the phi.
5166 for (VPRecipeBase &R : make_early_inc_range(
5167 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5169 continue;
5170
5171 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5172 // penultimate value of the recurrence. Instead we rely on the existing
5173 // extract of the last element from the result of
5174 // VPInstruction::FirstOrderRecurrenceSplice.
5175 // TODO: Consider vscale_range info and UF.
5176 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
5177 Range))
5178 return;
5179 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5180 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5181 "vector.recur.extract.for.phi");
5182 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5183 }
5184 }
5185}
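Illustration (not part of this file): which lanes of the recurrence's last vector value feed the two extracts created above, checked with plain arrays. With VF = 4 and the loop fully vectorized, the LCSSA use of s1 in the exit block needs a[n - 2], the penultimate lane, while the scalar resume value needs a[n - 1], the last lane. The array contents and sizes are made up for the example.

#include <array>
#include <cassert>
#include <vector>

int main() {
  constexpr int VF = 4;
  const int N = 12; // trip count; a multiple of VF so the vector loop covers it
  std::vector<int> A(N);
  for (int I = 0; I < N; ++I)
    A[I] = 7 * I + 1;

  // Last value of v2, the vector holding a[i..i+VF-1] in the final iteration.
  std::array<int, VF> V2{};
  for (int L = 0; L < VF; ++L)
    V2[L] = A[N - VF + L];

  assert(V2[VF - 2] == A[N - 2]); // "vector.recur.extract.for.phi"
  assert(V2[VF - 1] == A[N - 1]); // "vector.recur.extract"
  (void)V2;
}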
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, ScalarEvolution &SE, const Loop &L, VPTypeAnalysis &TypeInfo)
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, ScalarEvolution &SE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
APInt abs() const
Get the absolute value.
Definition APInt.h:1796
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:136
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3605
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3966
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4041
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3993
iterator end()
Definition VPlan.h:4003
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4001
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4054
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4015
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2512
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2546
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2536
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2552
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2532
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:151
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:189
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3016
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3547
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:431
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:426
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:404
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:416
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3716
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3637
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3061
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2049
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2092
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2081
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4119
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4143
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1036
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1127
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1071
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1066
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1063
@ CanonicalIVIncrementForPart
Definition VPlan.h:1056
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2654
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2646
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2675
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2728
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2686
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3203
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4271
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2890
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2779
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4154
const VPBlockBase * getEntry() const
Definition VPlan.h:4190
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4265
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4222
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4207
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4252
const VPBlockBase * getExiting() const
Definition VPlan.h:4202
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4215
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2935
bool isSingleScalar() const
Definition VPlan.h:2976
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3000
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3786
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:193
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1911
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3679
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1552
Instruction::CastOps getOpcode() const
Definition VPlan.h:1588
A recipe for handling GEP instructions.
Definition VPlan.h:1848
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2116
PHINode * getPHINode() const
Definition VPlan.h:2158
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2144
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2161
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2192
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2268
A recipe for widening vector intrinsics.
Definition VPlan.h:1602
A common base class for widening memory operations.
Definition VPlan.h:3246
A recipe for widened phis.
Definition VPlan.h:2326
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1512
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4284
bool hasVF(ElementCount VF) const
Definition VPlan.h:4489
LLVMContext & getContext() const
Definition VPlan.h:4477
VPBasicBlock * getEntry()
Definition VPlan.h:4377
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4468
bool hasScalableVF() const
Definition VPlan.h:4490
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4475
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4471
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4439
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4546
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4460
unsigned getUF() const
Definition VPlan.h:4509
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4623
bool hasUF(unsigned UF) const
Definition VPlan.h:4507
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4429
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4552
void setVF(ElementCount VF)
Definition VPlan.h:4483
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4522
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4453
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4402
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4601
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4549
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4531
bool hasScalarVFOnly() const
Definition VPlan.h:4500
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4420
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4571
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4425
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4568
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4382
void setUF(unsigned UF)
Definition VPlan.h:4514
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4655
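The VPlan accessors above are the typical queries a transform makes before rewriting a plan. A hedged sketch follows (hypothetical helper isScalableTailFoldedCandidate, using only signatures listed in this index), not an actual LLVM function:

#include "VPlan.h"

using namespace llvm;

// Hypothetical sketch: does the plan still model a scalable-VF candidate
// whose tail has been folded away (no scalar tail may execute)?
static bool isScalableTailFoldedCandidate(VPlan &Plan) {
  // Plans whose vector loop region has already been dissolved return null here.
  if (!Plan.getVectorLoopRegion())
    return false;
  return Plan.hasScalableVF() && !Plan.hasScalarTail();
}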
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
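To illustrate the FixedOrScalableQuantity queries above, here is a minimal sketch (hypothetical helper minTotalLanes, not part of LLVM) that computes a guaranteed lower bound on the lanes covered by VF times UF; for scalable VFs the result is only a minimum and is exact when isFixed() holds.

#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Hypothetical sketch: lower bound on the number of lanes covered by VF * UF.
static unsigned minTotalLanes(ElementCount VF, unsigned UF) {
  ElementCount Total = VF.multiplyCoefficientBy(UF);
  // getKnownMinValue() is exact for fixed VFs and a lower bound for scalable ones.
  return Total.getKnownMinValue();
}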
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
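As a quick illustration of the llvm::PatternMatch entries above, the sketch below (hypothetical helper isMulByOne) tests for a multiply by one with the operands in either order; m_Value comes from the same header even though it is not listed here.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Hypothetical sketch: true for "mul X, 1" or "mul 1, X", binding X.
static bool isMulByOne(Value *V, Value *&X) {
  return match(V, m_c_Mul(m_Value(X), m_One()));
}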
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
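The VPlan-level matchers above compose like their IR counterparts. A hedged sketch follows (hypothetical helper isOrWithTrue), restricted to the signatures listed in this index (m_c_BinaryOr, m_True, and the non-binding m_VPValue):

#include "VPlan.h"
#include "VPlanPatternMatch.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

// Hypothetical sketch: true if V is "or X, true" with the operands in either
// order, a shape a simplification could fold to the plan's i1 true value.
static bool isOrWithTrue(VPValue *V) {
  return match(V, m_c_BinaryOr(m_VPValue(), m_True()));
}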
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
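The vputils helpers above are commonly combined as a guard before deciding how a value is materialized. A hedged sketch (hypothetical helper mayStayScalar, assuming the helpers live in llvm::vputils as declared in VPlanUtils.h):

#include "VPlan.h"
#include "VPlanUtils.h"

using namespace llvm;

// Hypothetical sketch: can Def be kept as a scalar rather than broadcast?
static bool mayStayScalar(const VPValue *Def) {
  return vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def);
}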
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2032
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2484
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
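A self-contained illustration of make_early_inc_range with a plain std::set rather than a VPlan structure: the range advances its wrapped iterator when an element is dereferenced, so the loop body may erase the current element without breaking the traversal.

#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <set>

int main() {
  std::set<int> S = {1, 2, 3, 4, 5};
  // Erasing the current element is safe: iteration has already moved past it.
  for (int V : llvm::make_early_inc_range(S))
    if (V % 2 == 0)
      S.erase(V);
  assert(S.size() == 3);
  return 0;
}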
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
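To illustrate the traversal helpers above, a hedged sketch (hypothetical helper countVPBasicBlocks) that walks a plan's CFG from the entry block using the deep traversal, which also descends into region blocks:

#include "VPlan.h"
#include "VPlanCFG.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Hypothetical sketch: count the VPBasicBlocks reachable from the entry.
static unsigned countVPBasicBlocks(VPlan &Plan) {
  unsigned Count = 0;
  for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry()))
    if (isa<VPBasicBlock>(VPB))
      ++Count;
  return Count;
}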
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1966
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2042
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2120
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2100
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2368
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3378
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3337
A recipe for widening select instructions.
Definition VPlan.h:1801
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3462
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3419
static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...
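The static members above are the transform entry points declared in VPlanTransforms.h. Below is a hedged sketch (hypothetical sequence runBasicCleanup, member signatures exactly as listed) that chains a few cleanup passes over an already constructed plan; the real pipeline selects and orders transforms from cost-model and legality inputs not shown here.

#include "VPlanTransforms.h"

using namespace llvm;

// Hypothetical cleanup sequence; each call rewrites Plan in place.
static void runBasicCleanup(VPlan &Plan) {
  VPlanTransforms::simplifyRecipes(Plan);
  VPlanTransforms::removeBranchOnConst(Plan);
  VPlanTransforms::removeDeadRecipes(Plan);
}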