1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
50 VPlan &Plan,
52 GetIntOrFpInductionDescriptor,
53 const TargetLibraryInfo &TLI) {
54
56 Plan.getVectorLoopRegion());
58 // Skip blocks outside region
59 if (!VPBB->getParent())
60 break;
61 VPRecipeBase *Term = VPBB->getTerminator();
62 auto EndIter = Term ? Term->getIterator() : VPBB->end();
63 // Introduce each ingredient into VPlan.
64 for (VPRecipeBase &Ingredient :
65 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
66
67 VPValue *VPV = Ingredient.getVPSingleValue();
68 if (!VPV->getUnderlyingValue())
69 continue;
70
72 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
73 VPRecipeBase *NewRecipe = nullptr;
74 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
75 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
76 const auto *II = GetIntOrFpInductionDescriptor(Phi);
77 if (!II) {
78 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
79 for (VPValue *Op : PhiR->operands())
80 NewRecipe->addOperand(Op);
81 } else {
82 VPIRValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
83 VPValue *Step =
85 // It is always safe to copy over the NoWrap and FastMath flags. In
86 // particular, when folding tail by masking, the masked-off lanes are
87 // never used, so it is safe.
89 NewRecipe = new VPWidenIntOrFpInductionRecipe(
90 Phi, Start, Step, &Plan.getVF(), *II, Flags,
91 Ingredient.getDebugLoc());
92 }
93 } else {
94 auto *VPI = cast<VPInstruction>(&Ingredient);
95 assert(!isa<PHINode>(Inst) && "phis should be handled above");
96 // Create VPWidenMemoryRecipe for loads and stores.
97 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
98 NewRecipe = new VPWidenLoadRecipe(
99 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
100 false /*Consecutive*/, false /*Reverse*/, *VPI,
101 Ingredient.getDebugLoc());
102 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
103 NewRecipe = new VPWidenStoreRecipe(
104 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
105 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
106 Ingredient.getDebugLoc());
107 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
108 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
109 Ingredient.getDebugLoc());
110 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
111 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
112 if (VectorID == Intrinsic::not_intrinsic)
113 return false;
114 NewRecipe = new VPWidenIntrinsicRecipe(
115 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
116 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
117 *VPI, CI->getDebugLoc());
118 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
119 NewRecipe = new VPWidenCastRecipe(
120 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
121 VPIRFlags(*CI), VPIRMetadata(*CI));
122 } else {
123 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
124 *VPI, Ingredient.getDebugLoc());
125 }
126 }
127
128 NewRecipe->insertBefore(&Ingredient);
129 if (NewRecipe->getNumDefinedValues() == 1)
130 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
131 else
132 assert(NewRecipe->getNumDefinedValues() == 0 &&
133 "Only recpies with zero or one defined values expected");
134 Ingredient.eraseFromParent();
135 }
136 }
137 return true;
138}
139
140/// Helper for extra no-alias checks via known-safe recipe and SCEV.
142 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
143 VPReplicateRecipe &GroupLeader;
145 const Loop &L;
146 VPTypeAnalysis &TypeInfo;
147
148 // Return true if \p A and \p B are known to not alias for all VFs in the
149 // plan, checked via the distance between the accesses
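  // A rough worked example (assumed values, not from this file): for two i32
  // stores (store size 4) whose addresses differ by a constant SCEV distance
  // of 64 bytes, with a maximum fixed VF of 8 in the plan, each access group
  // spans at most 8 * 4 = 32 bytes per vector iteration, so |distance| >= 32
  // proves the two groups cannot overlap for any VF in the plan.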
150 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
151 if (A->getOpcode() != Instruction::Store ||
152 B->getOpcode() != Instruction::Store)
153 return false;
154
155 VPValue *AddrA = A->getOperand(1);
156 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
157 VPValue *AddrB = B->getOperand(1);
158 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
160 return false;
161
162 const APInt *Distance;
163 ScalarEvolution &SE = *PSE.getSE();
164 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
165 return false;
166
167 const DataLayout &DL = SE.getDataLayout();
168 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
169 uint64_t SizeA = DL.getTypeStoreSize(TyA);
170 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
171 uint64_t SizeB = DL.getTypeStoreSize(TyB);
172
173 // Use the maximum store size to ensure no overlap from either direction.
174 // Currently only handles fixed sizes, as it is only used for
175 // replicating VPReplicateRecipes.
176 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
177
178 auto VFs = B->getParent()->getPlan()->vectorFactors();
180 return Distance->abs().uge(
181 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
182 }
183
184public:
187 const Loop &L, VPTypeAnalysis &TypeInfo)
188 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
189 L(L), TypeInfo(TypeInfo) {}
190
191 /// Return true if \p R should be skipped during alias checking, either
192 /// because it's in the exclude set or because no-alias can be proven via
193 /// SCEV.
194 bool shouldSkip(VPRecipeBase &R) const {
195 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
196 return ExcludeRecipes.contains(&R) ||
197 (Store && isNoAliasViaDistance(Store, &GroupLeader));
198 }
199};
200
201/// Check if a memory operation doesn't alias with memory operations in blocks
202/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
203/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
204/// checked (for load hoisting). Otherwise recipes that both read and write
205/// memory are checked, and SCEV is used to prove no-alias between the group
206/// leader and other replicate recipes (for store sinking).
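/// A hedged illustration (assumed metadata, simplified): a load carrying
/// !alias.scope !{!"grpA"} may be moved across a store carrying
/// !noalias !{!"grpA"}, since the scoped-noalias metadata proves the two
/// locations are disjoint. If any recipe in the walked blocks has no memory
/// location, or its metadata cannot rule out aliasing, the function
/// conservatively returns false.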
207static bool
209 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
210 std::optional<SinkStoreInfo> SinkInfo = {}) {
211 bool CheckReads = SinkInfo.has_value();
212 if (!MemLoc.AATags.Scope)
213 return false;
214
215 const AAMDNodes &MemAA = MemLoc.AATags;
216
217 for (VPBlockBase *Block = FirstBB; Block;
218 Block = Block->getSingleSuccessor()) {
219 assert(Block->getNumSuccessors() <= 1 &&
220 "Expected at most one successor in block chain");
221 auto *VPBB = cast<VPBasicBlock>(Block);
222 for (VPRecipeBase &R : *VPBB) {
223 if (SinkInfo && SinkInfo->shouldSkip(R))
224 continue;
225
226 // Skip recipes that don't need checking.
227 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
228 continue;
229
231 if (!Loc)
232 // Conservatively assume aliasing for memory operations without
233 // location.
234 return false;
235
236 // For reads, check if they don't alias in the reverse direction and
237 // skip if so.
238 if (CheckReads && R.mayReadFromMemory() &&
240 MemAA.NoAlias))
241 continue;
242
243 // Check if the memory operations may alias in the forward direction.
245 Loc->AATags.NoAlias))
246 return false;
247 }
248
249 if (Block == LastBB)
250 break;
251 }
252 return true;
253}
254
255/// Return true if we do not know how to (mechanically) hoist or sink \p R out
256/// of a loop region.
258 // Assume intrinsics don't alias anything or throw; as long as they're
259 // guaranteed to execute, they're safe to hoist.
261 return false;
262
263 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
264 // memory location is not modified in the vector loop.
265 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
266 return true;
267
268 // Allocas cannot be hoisted.
269 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
270 return RepR && RepR->getOpcode() == Instruction::Alloca;
271}
272
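/// Sink single-scalar computations that feed predicated replicate regions into
/// those regions, so they only execute for active lanes. A minimal sketch in
/// simplified VPlan-style notation (assumed, not actual output):
///
///   vector.body:
///     REPLICATE ir<%d> = sdiv ir<%x>, ir<%y>    ; only used by the store
///   pred.store.if:
///     REPLICATE store ir<%d>, ir<%addr>
///
/// becomes
///
///   pred.store.if:
///     REPLICATE ir<%d> = sdiv ir<%x>, ir<%y>
///     REPLICATE store ir<%d>, ir<%addr>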
273static bool sinkScalarOperands(VPlan &Plan) {
274 auto Iter = vp_depth_first_deep(Plan.getEntry());
275 bool ScalarVFOnly = Plan.hasScalarVFOnly();
276 bool Changed = false;
277
279 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
280 VPBasicBlock *SinkTo, VPValue *Op) {
281 auto *Candidate =
282 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
283 if (!Candidate)
284 return;
285
286 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
287 // for now.
289 return;
290
291 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
292 return;
293
294 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
295 if (!ScalarVFOnly && RepR->isSingleScalar())
296 return;
297
298 WorkList.insert({SinkTo, Candidate});
299 };
300
301 // First, collect the operands of all recipes in replicate blocks as seeds for
302 // sinking.
304 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
305 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
306 continue;
307 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
308 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
309 continue;
310 for (auto &Recipe : *VPBB)
311 for (VPValue *Op : Recipe.operands())
312 InsertIfValidSinkCandidate(VPBB, Op);
313 }
314
315 // Try to sink each replicate or scalar IV steps recipe in the worklist.
316 for (unsigned I = 0; I != WorkList.size(); ++I) {
317 VPBasicBlock *SinkTo;
318 VPSingleDefRecipe *SinkCandidate;
319 std::tie(SinkTo, SinkCandidate) = WorkList[I];
320
321 // All recipe users of SinkCandidate must be in the block SinkTo itself, or
322 // all users outside of SinkTo must only use the first lane of SinkCandidate.
323 // In the latter case, SinkCandidate needs to be duplicated.
324 auto UsersOutsideSinkTo =
325 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
326 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
327 });
328 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
329 return !U->usesFirstLaneOnly(SinkCandidate);
330 }))
331 continue;
332 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
333
334 if (NeedsDuplicating) {
335 if (ScalarVFOnly)
336 continue;
337 VPSingleDefRecipe *Clone;
338 if (auto *SinkCandidateRepR =
339 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
340 // TODO: Handle converting to uniform recipes as separate transform,
341 // then cloning should be sufficient here.
342 Instruction *I = SinkCandidate->getUnderlyingInstr();
343 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
344 nullptr /*Mask*/, *SinkCandidateRepR,
345 *SinkCandidateRepR);
346 // TODO: add ".cloned" suffix to name of Clone's VPValue.
347 } else {
348 Clone = SinkCandidate->clone();
349 }
350
351 Clone->insertBefore(SinkCandidate);
352 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
353 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
354 });
355 }
356 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
357 for (VPValue *Op : SinkCandidate->operands())
358 InsertIfValidSinkCandidate(SinkTo, Op);
359 Changed = true;
360 }
361 return Changed;
362}
363
364/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
365/// the mask.
367 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
368 if (!EntryBB || EntryBB->size() != 1 ||
369 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
370 return nullptr;
371
372 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
373}
374
375/// If \p R is a triangle region, return the 'then' block of the triangle.
377 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
378 if (EntryBB->getNumSuccessors() != 2)
379 return nullptr;
380
381 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
382 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
383 if (!Succ0 || !Succ1)
384 return nullptr;
385
386 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
387 return nullptr;
388 if (Succ0->getSingleSuccessor() == Succ1)
389 return Succ0;
390 if (Succ1->getSingleSuccessor() == Succ0)
391 return Succ1;
392 return nullptr;
393}
394
395// Merge replicate regions into their successor region, if a replicate region
396// is connected to a successor replicate region with the same predicate by a
397// single, empty VPBasicBlock.
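// An illustrative CFG sketch (assumed, simplified):
//
//   [region1, mask M] -> [empty VPBB] -> [region2, mask M] -> ...
//
// is merged so that region1's 'then' recipes are moved into region2's 'then'
// block, its VPPredInstPHIRecipes into region2's merge block, and region1 is
// disconnected from the CFG.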
399 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
400
401 // Collect replicate regions followed by an empty block, followed by another
402 // replicate region with matching masks, to process up front. This is to avoid
403 // iterator invalidation issues while merging regions.
406 vp_depth_first_deep(Plan.getEntry()))) {
407 if (!Region1->isReplicator())
408 continue;
409 auto *MiddleBasicBlock =
410 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
411 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
412 continue;
413
414 auto *Region2 =
415 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
416 if (!Region2 || !Region2->isReplicator())
417 continue;
418
419 VPValue *Mask1 = getPredicatedMask(Region1);
420 VPValue *Mask2 = getPredicatedMask(Region2);
421 if (!Mask1 || Mask1 != Mask2)
422 continue;
423
424 assert(Mask1 && Mask2 && "both regions must have conditions");
425 WorkList.push_back(Region1);
426 }
427
428 // Move recipes from Region1 to its successor region, if both are triangles.
429 for (VPRegionBlock *Region1 : WorkList) {
430 if (TransformedRegions.contains(Region1))
431 continue;
432 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
433 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
434
435 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
436 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
437 if (!Then1 || !Then2)
438 continue;
439
440 // Note: No fusion-preventing memory dependencies are expected in either
441 // region. Such dependencies should be rejected during earlier dependence
442 // checks, which guarantee accesses can be re-ordered for vectorization.
443 //
444 // Move recipes to the successor region.
445 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
446 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
447
448 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
449 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
450
451 // Move VPPredInstPHIRecipes from the merge block to the successor region's
452 // merge block. Update all users inside the successor region to use the
453 // original values.
454 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
455 VPValue *PredInst1 =
456 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
457 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
458 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
459 return cast<VPRecipeBase>(&U)->getParent() == Then2;
460 });
461
462 // Remove phi recipes that are unused after merging the regions.
463 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
464 Phi1ToMove.eraseFromParent();
465 continue;
466 }
467 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
468 }
469
470 // Remove the dead recipes in Region1's entry block.
471 for (VPRecipeBase &R :
472 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
473 R.eraseFromParent();
474
475 // Finally, remove the first region.
476 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
477 VPBlockUtils::disconnectBlocks(Pred, Region1);
478 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
479 }
480 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
481 TransformedRegions.insert(Region1);
482 }
483
484 return !TransformedRegions.empty();
485}
486
488 VPlan &Plan) {
489 Instruction *Instr = PredRecipe->getUnderlyingInstr();
490 // Build the triangular if-then region.
491 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
492 assert(Instr->getParent() && "Predicated instruction not in any basic block");
493 auto *BlockInMask = PredRecipe->getMask();
494 auto *MaskDef = BlockInMask->getDefiningRecipe();
495 auto *BOMRecipe = new VPBranchOnMaskRecipe(
496 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
497 auto *Entry =
498 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
499
500 // Replace predicated replicate recipe with a replicate recipe without a
501 // mask but in the replicate region.
502 auto *RecipeWithoutMask = new VPReplicateRecipe(
503 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
504 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
505 PredRecipe->getDebugLoc());
506 auto *Pred =
507 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
508
509 VPPredInstPHIRecipe *PHIRecipe = nullptr;
510 if (PredRecipe->getNumUsers() != 0) {
511 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
512 RecipeWithoutMask->getDebugLoc());
513 PredRecipe->replaceAllUsesWith(PHIRecipe);
514 PHIRecipe->setOperand(0, RecipeWithoutMask);
515 }
516 PredRecipe->eraseFromParent();
517 auto *Exiting =
518 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
520 Plan.createReplicateRegion(Entry, Exiting, RegionName);
521
522 // Note: first set Entry as region entry and then connect successors starting
523 // from it in order, to propagate the "parent" of each VPBasicBlock.
524 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
525 VPBlockUtils::connectBlocks(Pred, Exiting);
526
527 return Region;
528}
529
530static void addReplicateRegions(VPlan &Plan) {
533 vp_depth_first_deep(Plan.getEntry()))) {
534 for (VPRecipeBase &R : *VPBB)
535 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
536 if (RepR->isPredicated())
537 WorkList.push_back(RepR);
538 }
539 }
540
541 unsigned BBNum = 0;
542 for (VPReplicateRecipe *RepR : WorkList) {
543 VPBasicBlock *CurrentBlock = RepR->getParent();
544 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
545
546 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
547 SplitBlock->setName(
548 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
549 // Record predicated instructions for above packing optimizations.
551 Region->setParent(CurrentBlock->getParent());
553
554 VPRegionBlock *ParentRegion = Region->getParent();
555 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
556 ParentRegion->setExiting(SplitBlock);
557 }
558}
559
560/// Remove redundant VPBasicBlocks by merging them into their predecessor if
561/// the predecessor has a single successor.
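/// For example (illustrative): a chain  pred.bb -> succ.bb  where pred.bb has
/// succ.bb as its only successor and succ.bb has pred.bb as its only
/// predecessor is folded into pred.bb, which inherits succ.bb's successors
/// (and its role as a region's exiting block, if it had one).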
565 vp_depth_first_deep(Plan.getEntry()))) {
566 // Don't fold the blocks in the skeleton of the Plan into their single
567 // predecessors for now.
568 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
569 if (!VPBB->getParent())
570 continue;
571 auto *PredVPBB =
572 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
573 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
574 isa<VPIRBasicBlock>(PredVPBB))
575 continue;
576 WorkList.push_back(VPBB);
577 }
578
579 for (VPBasicBlock *VPBB : WorkList) {
580 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
581 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
582 R.moveBefore(*PredVPBB, PredVPBB->end());
583 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
584 auto *ParentRegion = VPBB->getParent();
585 if (ParentRegion && ParentRegion->getExiting() == VPBB)
586 ParentRegion->setExiting(PredVPBB);
587 for (auto *Succ : to_vector(VPBB->successors())) {
589 VPBlockUtils::connectBlocks(PredVPBB, Succ);
590 }
591 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
592 }
593 return !WorkList.empty();
594}
595
597 // Convert masked VPReplicateRecipes to if-then region blocks.
599
600 bool ShouldSimplify = true;
601 while (ShouldSimplify) {
602 ShouldSimplify = sinkScalarOperands(Plan);
603 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
604 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
605 }
606}
607
608/// Remove redundant casts of inductions.
609///
610/// Such redundant casts are casts of induction variables that can be ignored,
611/// because we already proved that the casted phi is equal to the uncasted phi
612/// in the vectorized loop. There is no need to vectorize the cast - the same
613/// value can be used for both the phi and casts in the vector loop.
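/// A minimal sketch (assumed IR, simplified): for an induction
///   %iv   = phi i32 [ 0, %ph ], [ %iv.next, %loop ]
///   %cast = sext i32 %iv to i64   ; recorded in the induction descriptor
/// the recipe created for %cast is replaced by the widened IV itself, and the
/// now-dead cast recipes are removed by later dead-recipe cleanup.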
615 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
617 if (!IV || IV->getTruncInst())
618 continue;
619
620 // A sequence of IR Casts has potentially been recorded for IV, which
621 // *must be bypassed* when the IV is vectorized, because the vectorized IV
622 // will produce the desired casted value. This sequence forms a def-use
623 // chain and is provided in reverse order, ending with the cast that uses
624 // the IV phi. Search for the recipe of the last cast in the chain and
625 // replace it with the original IV. Note that only the final cast is
626 // expected to have users outside the cast-chain and the dead casts left
627 // over will be cleaned up later.
628 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
629 VPValue *FindMyCast = IV;
630 for (Instruction *IRCast : reverse(Casts)) {
631 VPSingleDefRecipe *FoundUserCast = nullptr;
632 for (auto *U : FindMyCast->users()) {
633 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
634 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
635 FoundUserCast = UserCast;
636 break;
637 }
638 }
639 FindMyCast = FoundUserCast;
640 }
641 FindMyCast->replaceAllUsesWith(IV);
642 }
643}
644
645/// Try to replace VPWidenCanonicalIVRecipes with an already-existing widened
646/// canonical induction (VPWidenIntOrFpInductionRecipe), if there is one.
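/// For instance (illustrative VPlan-style notation): if the header already has
///   WIDEN-INDUCTION %iv = phi ir<0>, %iv.next   ; canonical: start 0, step 1
/// then a separate  EMIT %wide.civ = WIDEN-CANONICAL-INDUCTION  is redundant
/// and its users can be rewired to %iv.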
648 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
649 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
650 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
651 for (VPUser *U : CanonicalIV->users()) {
653 if (WidenNewIV)
654 break;
655 }
656
657 if (!WidenNewIV)
658 return;
659
660 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
661 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
662 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
663
664 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
665 continue;
666
667 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
668 // everything WidenNewIV's users need. That is, WidenOriginalIV will
669 // generate a vector phi or all users of WidenNewIV demand the first lane
670 // only.
671 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
672 vputils::onlyFirstLaneUsed(WidenNewIV)) {
673 // We are replacing a wide canonical IV with a suitable wide induction.
674 // It is used to compute the header mask, hence all lanes will be used and
675 // we need to drop wrap flags that only apply to lanes guaranteed to execute
676 // in the original scalar loop.
677 WidenOriginalIV->dropPoisonGeneratingFlags();
678 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
679 WidenNewIV->eraseFromParent();
680 return;
681 }
682 }
683}
684
685/// Returns true if \p R is dead and can be removed.
686static bool isDeadRecipe(VPRecipeBase &R) {
687 // Do remove conditional assume instructions as their conditions may be
688 // flattened.
689 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
690 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
692 if (IsConditionalAssume)
693 return true;
694
695 if (R.mayHaveSideEffects())
696 return false;
697
698 // Recipe is dead if no user keeps the recipe alive.
699 return all_of(R.definedValues(),
700 [](VPValue *V) { return V->getNumUsers() == 0; });
701}
702
705 vp_post_order_deep(Plan.getEntry()))) {
706 // The recipes in the block are processed in reverse order, to catch chains
707 // of dead recipes.
708 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
709 if (isDeadRecipe(R)) {
710 R.eraseFromParent();
711 continue;
712 }
713
714 // Check if R is a dead VPPhi <-> update cycle and remove it.
715 auto *PhiR = dyn_cast<VPPhi>(&R);
716 if (!PhiR || PhiR->getNumOperands() != 2)
717 continue;
718 VPUser *PhiUser = PhiR->getSingleUser();
719 if (!PhiUser)
720 continue;
721 VPValue *Incoming = PhiR->getOperand(1);
722 if (PhiUser != Incoming->getDefiningRecipe() ||
723 Incoming->getNumUsers() != 1)
724 continue;
725 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
726 PhiR->eraseFromParent();
727 Incoming->getDefiningRecipe()->eraseFromParent();
728 }
729 }
730}
731
734 Instruction::BinaryOps InductionOpcode,
735 FPMathOperator *FPBinOp, Instruction *TruncI,
736 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
737 VPBuilder &Builder) {
738 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
739 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
740 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
741 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
742 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
743
744 // Truncate base induction if needed.
745 VPTypeAnalysis TypeInfo(Plan);
746 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
747 if (TruncI) {
748 Type *TruncTy = TruncI->getType();
749 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
750 "Not truncating.");
751 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
752 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
753 ResultTy = TruncTy;
754 }
755
756 // Truncate step if needed.
757 Type *StepTy = TypeInfo.inferScalarType(Step);
758 if (ResultTy != StepTy) {
759 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
760 "Not truncating.");
761 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
762 auto *VecPreheader =
764 VPBuilder::InsertPointGuard Guard(Builder);
765 Builder.setInsertPoint(VecPreheader);
766 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
767 }
768 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
769 &Plan.getVF(), DL);
770}
771
774 for (unsigned I = 0; I != Users.size(); ++I) {
776 if (isa<VPHeaderPHIRecipe>(Cur))
777 continue;
778 for (VPValue *V : Cur->definedValues())
779 Users.insert_range(V->users());
780 }
781 return Users.takeVector();
782}
783
784/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
785/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
786/// generates scalar values.
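/// Roughly (assumed, simplified VPlan-style notation):
///   EMIT %ptr.iv = WIDEN-POINTER-INDUCTION %start, %step
/// is rewritten as
///   EMIT vp<%steps>    = SCALAR-STEPS ir<0>, %step
///   EMIT vp<%next.gep> = ptradd %start, vp<%steps>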
787static VPValue *
789 VPlan &Plan, VPBuilder &Builder) {
791 VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
792 VPValue *StepV = PtrIV->getOperand(1);
794 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
795 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
796
797 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
798 PtrIV->getDebugLoc(), "next.gep");
799}
800
801/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
802/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
803/// VPWidenPointerInductionRecipe will generate vectors only. If some users
804/// require vectors while others require scalars, the scalar uses need to
805/// extract the scalars from the generated vectors (note that this is different
806/// from how int/fp inductions are handled). Legalize extract-from-ends using
807/// uniform VPReplicateRecipe of wide inductions to use regular
808/// VPReplicateRecipe, so the correct end value is available. Also optimize
809/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
810/// providing them scalar steps built on the canonical scalar IV and updating
811/// the original IV's users. This is an optional optimization to reduce the
812/// need for vector extracts.
815 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
816 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
817 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
818 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
819 if (!PhiR)
820 continue;
821
822 // Try to narrow wide and replicating recipes to uniform recipes, based on
823 // VPlan analysis.
824 // TODO: Apply to all recipes in the future, to replace legacy uniformity
825 // analysis.
826 auto Users = collectUsersRecursively(PhiR);
827 for (VPUser *U : reverse(Users)) {
828 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
829 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
830 // Skip recipes that shouldn't be narrowed.
831 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
832 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
833 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
834 continue;
835
836 // Skip recipes where lanes other than the first may be used.
838 continue;
839
840 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
841 Def->operands(), /*IsUniform*/ true,
842 /*Mask*/ nullptr, /*Flags*/ *Def);
843 Clone->insertAfter(Def);
844 Def->replaceAllUsesWith(Clone);
845 }
846
847 // Replace wide pointer inductions of which only the scalar values are used
848 // with PtrAdd(IndStart, ScalarIVSteps(0, Step)).
849 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
850 if (!Plan.hasScalarVFOnly() &&
851 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
852 continue;
853
854 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
855 PtrIV->replaceAllUsesWith(PtrAdd);
856 continue;
857 }
858
859 // Replace widened induction with scalar steps for users that only use
860 // scalars.
861 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
862 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
863 return U->usesScalars(WideIV);
864 }))
865 continue;
866
867 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
869 Plan, ID.getKind(), ID.getInductionOpcode(),
870 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
871 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
872 WideIV->getDebugLoc(), Builder);
873
874 // Update scalar users of the IV to use Steps instead.
875 if (!HasOnlyVectorVFs) {
876 assert(!Plan.hasScalableVF() &&
877 "plans containing a scalar VF cannot also include scalable VFs");
878 WideIV->replaceAllUsesWith(Steps);
879 } else {
880 bool HasScalableVF = Plan.hasScalableVF();
881 WideIV->replaceUsesWithIf(Steps,
882 [WideIV, HasScalableVF](VPUser &U, unsigned) {
883 if (HasScalableVF)
884 return U.usesFirstLaneOnly(WideIV);
885 return U.usesScalars(WideIV);
886 });
887 }
888 }
889}
890
891/// Check if \p VPV is an untruncated wide induction, either before or after the
892/// increment. If so, return the header IV (before the increment), otherwise
893/// return null.
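/// For example (illustrative): given
///   %iv      = WIDEN-INDUCTION phi %start, %iv.next
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are accepted, and in both cases the header phi %iv is
/// returned; callers compare the returned IV against the original value to
/// detect the incremented form.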
896 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
897 if (WideIV) {
898 // VPV itself is a wide induction, separately compute the end value for exit
899 // users if it is not a truncated IV.
900 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
901 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
902 }
903
904 // Check if VPV is an optimizable induction increment.
905 VPRecipeBase *Def = VPV->getDefiningRecipe();
906 if (!Def || Def->getNumOperands() != 2)
907 return nullptr;
908 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
909 if (!WideIV)
910 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
911 if (!WideIV)
912 return nullptr;
913
914 auto IsWideIVInc = [&]() {
915 auto &ID = WideIV->getInductionDescriptor();
916
917 // Check if VPV increments the induction by the induction step.
918 VPValue *IVStep = WideIV->getStepValue();
919 switch (ID.getInductionOpcode()) {
920 case Instruction::Add:
921 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
922 case Instruction::FAdd:
924 m_Specific(IVStep)));
925 case Instruction::FSub:
926 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
927 m_Specific(IVStep)));
928 case Instruction::Sub: {
929 // IVStep will be the negated step of the subtraction. Check if Step == -1
930 // * IVStep.
931 VPValue *Step;
932 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
933 return false;
934 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
935 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
936 ScalarEvolution &SE = *PSE.getSE();
937 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
938 !isa<SCEVCouldNotCompute>(StepSCEV) &&
939 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
940 }
941 default:
942 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
943 match(VPV, m_GetElementPtr(m_Specific(WideIV),
944 m_Specific(WideIV->getStepValue())));
945 }
946 llvm_unreachable("should have been covered by switch above");
947 };
948 return IsWideIVInc() ? WideIV : nullptr;
949}
950
951/// Attempts to optimize the induction variable exit values for users in the
952/// early exit block.
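/// In rough terms (assumed, simplified): the exit value is rebuilt as
///   %lane = first-active-lane(%exit.mask)
///   %end  = %canonical.iv + %lane     ; plus 1 if the incremented IV exits
/// and, for non-canonical inductions, rebased via a derived IV using the
/// original start and step.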
954 VPTypeAnalysis &TypeInfo,
955 VPBlockBase *PredVPBB,
956 VPValue *Op,
958 VPValue *Incoming, *Mask;
961 return nullptr;
962
963 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
964 if (!WideIV)
965 return nullptr;
966
967 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
968 if (WideIntOrFp && WideIntOrFp->getTruncInst())
969 return nullptr;
970
971 // Calculate the final index.
972 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
973 auto *CanonicalIV = LoopRegion->getCanonicalIV();
974 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
975 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
976
977 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
978 VPValue *FirstActiveLane =
979 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
980 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
981 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
982 FirstActiveLaneType, DL);
983 VPValue *EndValue =
984 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
985
986 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
987 // Incoming differs from it, the exit uses the incremented value and we need
988 // to add the step.
989 if (Incoming != WideIV) {
990 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
991 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
992 }
993
994 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
995 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
996 VPIRValue *Start = WideIV->getStartValue();
997 VPValue *Step = WideIV->getStepValue();
998 EndValue = B.createDerivedIV(
999 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1000 Start, EndValue, Step);
1001 }
1002
1003 return EndValue;
1004}
1005
1006/// Attempts to optimize the induction variable exit values for users in the
1007/// exit block coming from the latch in the original scalar loop.
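/// Sketch (assumed): the end value of the IV is looked up in the pre-computed
/// \p EndValues; an exit user of the pre-incremented IV then gets
///   %ind.escape = %end.value - %step
/// with pointer inductions using a ptradd of the negated step and
/// floating-point inductions using FSub/FAdd instead.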
1009 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1013 return nullptr;
1014
1015 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1016 if (!WideIV)
1017 return nullptr;
1018
1019 VPValue *EndValue = EndValues.lookup(WideIV);
1020 assert(EndValue && "end value must have been pre-computed");
1021
1022 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
1023 // Incoming differs from it, the exit uses the incremented value and we
1024 // don't need to subtract the step.
1025 if (Incoming != WideIV)
1026 return EndValue;
1027
1028 // Otherwise, subtract the step from the EndValue.
1029 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1030 VPValue *Step = WideIV->getStepValue();
1031 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1032 if (ScalarTy->isIntegerTy())
1033 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1034 DebugLoc::getUnknown(), "ind.escape");
1035 if (ScalarTy->isPointerTy()) {
1036 Type *StepTy = TypeInfo.inferScalarType(Step);
1037 auto *Zero = Plan.getConstantInt(StepTy, 0);
1038 return B.createPtrAdd(EndValue,
1039 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1040 DebugLoc::getUnknown(), "ind.escape");
1041 }
1042 if (ScalarTy->isFloatingPointTy()) {
1043 const auto &ID = WideIV->getInductionDescriptor();
1044 return B.createNaryOp(
1045 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1046 ? Instruction::FSub
1047 : Instruction::FAdd,
1048 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1049 }
1050 llvm_unreachable("all possible induction types must be handled");
1051 return nullptr;
1052}
1053
1055 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1057 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1058 VPTypeAnalysis TypeInfo(Plan);
1059 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1060 for (VPRecipeBase &R : ExitVPBB->phis()) {
1061 auto *ExitIRI = cast<VPIRPhi>(&R);
1062
1063 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1064 VPValue *Escape = nullptr;
1065 if (PredVPBB == MiddleVPBB)
1066 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1067 ExitIRI->getOperand(Idx),
1068 EndValues, PSE);
1069 else
1071 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1072 if (Escape)
1073 ExitIRI->setOperand(Idx, Escape);
1074 }
1075 }
1076 }
1077}
1078
1079/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1080/// them with already existing recipes expanding the same SCEV expression.
1083
1084 for (VPRecipeBase &R :
1086 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1087 if (!ExpR)
1088 continue;
1089
1090 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1091 if (Inserted)
1092 continue;
1093 ExpR->replaceAllUsesWith(V->second);
1094 ExpR->eraseFromParent();
1095 }
1096}
1097
1099 SmallVector<VPValue *> WorkList;
1101 WorkList.push_back(V);
1102
1103 while (!WorkList.empty()) {
1104 VPValue *Cur = WorkList.pop_back_val();
1105 if (!Seen.insert(Cur).second)
1106 continue;
1107 VPRecipeBase *R = Cur->getDefiningRecipe();
1108 if (!R)
1109 continue;
1110 if (!isDeadRecipe(*R))
1111 continue;
1112 append_range(WorkList, R->operands());
1113 R->eraseFromParent();
1114 }
1115}
1116
1117/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1118/// Returns an optional pair, where the first element indicates whether it is
1119/// an intrinsic ID.
1120static std::optional<std::pair<bool, unsigned>>
1122 return TypeSwitch<const VPSingleDefRecipe *,
1123 std::optional<std::pair<bool, unsigned>>>(R)
1126 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1127 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1128 return std::make_pair(true, I->getVectorIntrinsicID());
1129 })
1130 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1131 // For recipes that do not directly map to LLVM IR instructions,
1132 // assign opcodes after the last VPInstruction opcode (which is also
1133 // after the last IR Instruction opcode), based on the VPDefID.
1134 return std::make_pair(false,
1135 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1136 })
1137 .Default([](auto *) { return std::nullopt; });
1138}
1139
1140/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1141/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1142/// Operands are foldable live-ins.
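/// For example (illustrative): an add whose operands are both constant
/// live-ins, or a compare of two constant live-ins, is folded by
/// InstSimplifyFolder into a single IR value, which is re-registered as a
/// live-in of the plan and returned; recipes with non-live-in operands are
/// left alone (nullptr is returned).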
1144 ArrayRef<VPValue *> Operands,
1145 const DataLayout &DL,
1146 VPTypeAnalysis &TypeInfo) {
1147 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1148 if (!OpcodeOrIID)
1149 return nullptr;
1150
1152 for (VPValue *Op : Operands) {
1154 return nullptr;
1155 Value *V = Op->getUnderlyingValue();
1156 if (!V)
1157 return nullptr;
1158 Ops.push_back(V);
1159 }
1160
1161 auto FoldToIRValue = [&]() -> Value * {
1162 InstSimplifyFolder Folder(DL);
1163 if (OpcodeOrIID->first) {
1164 if (R.getNumOperands() != 2)
1165 return nullptr;
1166 unsigned ID = OpcodeOrIID->second;
1167 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1168 TypeInfo.inferScalarType(&R));
1169 }
1170 unsigned Opcode = OpcodeOrIID->second;
1171 if (Instruction::isBinaryOp(Opcode))
1172 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1173 Ops[0], Ops[1]);
1174 if (Instruction::isCast(Opcode))
1175 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1176 TypeInfo.inferScalarType(R.getVPSingleValue()));
1177 switch (Opcode) {
1179 return Folder.FoldSelect(Ops[0], Ops[1],
1181 case VPInstruction::Not:
1182 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1184 case Instruction::Select:
1185 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1186 case Instruction::ICmp:
1187 case Instruction::FCmp:
1188 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1189 Ops[1]);
1190 case Instruction::GetElementPtr: {
1191 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1192 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1193 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1194 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1195 }
1198 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1199 Ops[0], Ops[1],
1200 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1201 // An extract of a live-in is an extract of a broadcast, so return the
1202 // broadcasted element.
1203 case Instruction::ExtractElement:
1204 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1205 return Ops[0];
1206 }
1207 return nullptr;
1208 };
1209
1210 if (Value *V = FoldToIRValue())
1211 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1212 return nullptr;
1213}
1214
1215/// Try to simplify VPSingleDefRecipe \p Def.
1217 VPlan *Plan = Def->getParent()->getPlan();
1218
1219 // Simplification of live-in IR values for SingleDef recipes using
1220 // InstSimplifyFolder.
1221 const DataLayout &DL =
1223 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1224 return Def->replaceAllUsesWith(V);
1225
1226 // Fold PredPHI LiveIn -> LiveIn.
1227 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1228 VPValue *Op = PredPHI->getOperand(0);
1229 if (isa<VPIRValue>(Op))
1230 PredPHI->replaceAllUsesWith(Op);
1231 }
1232
1233 VPBuilder Builder(Def);
1234 VPValue *A;
1235 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1236 Type *TruncTy = TypeInfo.inferScalarType(Def);
1237 Type *ATy = TypeInfo.inferScalarType(A);
1238 if (TruncTy == ATy) {
1239 Def->replaceAllUsesWith(A);
1240 } else {
1241 // Don't replace a scalarizing recipe with a widened cast.
1242 if (isa<VPReplicateRecipe>(Def))
1243 return;
1244 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1245
1246 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1247 ? Instruction::SExt
1248 : Instruction::ZExt;
1249 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1250 TruncTy);
1251 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1252 // UnderlyingExt has distinct return type, used to retain legacy cost.
1253 Ext->setUnderlyingValue(UnderlyingExt);
1254 }
1255 Def->replaceAllUsesWith(Ext);
1256 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1257 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1258 Def->replaceAllUsesWith(Trunc);
1259 }
1260 }
1261#ifndef NDEBUG
1262 // Verify that the cached type info for both A and its users is still
1263 // accurate by comparing it to freshly computed types.
1264 VPTypeAnalysis TypeInfo2(*Plan);
1265 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1266 for (VPUser *U : A->users()) {
1267 auto *R = cast<VPRecipeBase>(U);
1268 for (VPValue *VPV : R->definedValues())
1269 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1270 }
1271#endif
1272 }
1273
1274 // Simplify (X && Y) || (X && !Y) -> X.
1275 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1276 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1277 // recipes to be visited during simplification.
1278 VPValue *X, *Y, *Z;
1279 if (match(Def,
1282 Def->replaceAllUsesWith(X);
1283 Def->eraseFromParent();
1284 return;
1285 }
1286
1287 // x | 1 -> 1
1288 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1289 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1290
1291 // x | 0 -> x
1292 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1293 return Def->replaceAllUsesWith(X);
1294
1295 // x & 0 -> 0
1296 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1297 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1298
1299 // x && false -> false
1300 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1301 return Def->replaceAllUsesWith(Def->getOperand(1));
1302
1303 // (x && y) || (x && z) -> x && (y || z)
1306 // Simplify only if one of the operands has one use to avoid creating an
1307 // extra recipe.
1308 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1309 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1310 return Def->replaceAllUsesWith(
1311 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1312
1313 // x && !x -> 0
1315 return Def->replaceAllUsesWith(Plan->getFalse());
1316
1317 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1318 return Def->replaceAllUsesWith(X);
1319
1320 // select !c, x, y -> select c, y, x
1321 VPValue *C;
1322 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1323 Def->setOperand(0, C);
1324 Def->setOperand(1, Y);
1325 Def->setOperand(2, X);
1326 return;
1327 }
1328
1329 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1330 // tail folding it is likely that x is a header mask and can be simplified
1331 // further.
1333 m_VPValue(Z))) &&
1334 X->hasMoreThanOneUniqueUser())
1335 return Def->replaceAllUsesWith(
1336 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1337
1338 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1339 return Def->replaceAllUsesWith(A);
1340
1341 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1342 return Def->replaceAllUsesWith(A);
1343
1344 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1345 return Def->replaceAllUsesWith(
1346 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1347
1348 const APInt *APC;
1349 if (match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1350 return Def->replaceAllUsesWith(Builder.createNaryOp(
1351 Instruction::Shl,
1352 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1353 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1354
1355 if (match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1356 return Def->replaceAllUsesWith(Builder.createNaryOp(
1357 Instruction::LShr,
1358 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())}, {},
1359 Def->getDebugLoc()));
1360
1361 if (match(Def, m_Not(m_VPValue(A)))) {
1362 if (match(A, m_Not(m_VPValue(A))))
1363 return Def->replaceAllUsesWith(A);
1364
1365 // Try to fold Not into compares by adjusting the predicate in-place.
1366 CmpPredicate Pred;
1367 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1368 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1369 if (all_of(Cmp->users(),
1371 m_Not(m_Specific(Cmp)),
1372 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1373 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1374 for (VPUser *U : to_vector(Cmp->users())) {
1375 auto *R = cast<VPSingleDefRecipe>(U);
1376 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1377 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1378 R->setOperand(1, Y);
1379 R->setOperand(2, X);
1380 } else {
1381 // not (cmp pred) -> cmp inv_pred
1382 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1383 R->replaceAllUsesWith(Cmp);
1384 }
1385 }
1386 // If Cmp doesn't have a debug location, use the one from the negation,
1387 // to preserve the location.
1388 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1389 Cmp->setDebugLoc(Def->getDebugLoc());
1390 }
1391 }
1392 }
1393
1394 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1395 // any-of (fcmp uno %A, %B), ...
1396 if (match(Def, m_AnyOf())) {
1398 VPRecipeBase *UnpairedCmp = nullptr;
1399 for (VPValue *Op : Def->operands()) {
1400 VPValue *X;
1401 if (Op->getNumUsers() > 1 ||
1403 m_Deferred(X)))) {
1404 NewOps.push_back(Op);
1405 } else if (!UnpairedCmp) {
1406 UnpairedCmp = Op->getDefiningRecipe();
1407 } else {
1408 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1409 UnpairedCmp->getOperand(0), X));
1410 UnpairedCmp = nullptr;
1411 }
1412 }
1413
1414 if (UnpairedCmp)
1415 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1416
1417 if (NewOps.size() < Def->getNumOperands()) {
1418 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1419 return Def->replaceAllUsesWith(NewAnyOf);
1420 }
1421 }
1422
1423 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1424 // This is useful for fmax/fmin without fast-math flags, where we need to
1425 // check if any operand is NaN.
1427 m_Deferred(X)),
1429 m_Deferred(Y))))) {
1430 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1431 return Def->replaceAllUsesWith(NewCmp);
1432 }
1433
1434 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1435 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1436 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1437 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1438 TypeInfo.inferScalarType(Def))
1439 return Def->replaceAllUsesWith(Def->getOperand(1));
1440
1442 m_One()))) {
1443 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1444 if (TypeInfo.inferScalarType(X) != WideStepTy)
1445 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1446 Def->replaceAllUsesWith(X);
1447 return;
1448 }
1449
1450 // For i1 vp.merges produced by AnyOf reductions:
1451 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1453 m_VPValue(X), m_VPValue())) &&
1455 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1456 Def->setOperand(1, Def->getOperand(0));
1457 Def->setOperand(0, Y);
1458 return;
1459 }
1460
1461 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1462 if (Phi->getOperand(0) == Phi->getOperand(1))
1463 Phi->replaceAllUsesWith(Phi->getOperand(0));
1464 return;
1465 }
1466
1467 // Look through ExtractLastLane.
1468 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1469 if (match(A, m_BuildVector())) {
1470 auto *BuildVector = cast<VPInstruction>(A);
1471 Def->replaceAllUsesWith(
1472 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1473 return;
1474 }
1475 if (Plan->hasScalarVFOnly())
1476 return Def->replaceAllUsesWith(A);
1477 }
1478
1479 // Look through ExtractPenultimateElement (BuildVector ....).
1481 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1482 Def->replaceAllUsesWith(
1483 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1484 return;
1485 }
1486
1487 uint64_t Idx;
1489 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1490 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1491 return;
1492 }
1493
1494 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1495 Def->replaceAllUsesWith(
1496 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1497 return;
1498 }
1499
1500 // Look through broadcast of single-scalar when used as select conditions; in
1501 // that case the scalar condition can be used directly.
1502 if (match(Def,
1505 "broadcast operand must be single-scalar");
1506 Def->setOperand(0, C);
1507 return;
1508 }
1509
1510 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1511 if (Phi->getNumOperands() == 1)
1512 Phi->replaceAllUsesWith(Phi->getOperand(0));
1513 return;
1514 }
1515
1516 // Some simplifications can only be applied after unrolling. Perform them
1517 // below.
1518 if (!Plan->isUnrolled())
1519 return;
1520
1521 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1522 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1523 isa<VPPhi>(X)) {
1524 auto *Phi = cast<VPPhi>(X);
1525 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1526 Phi->getSingleUser() == Def) {
1527 Phi->setOperand(0, Y);
1528 Def->replaceAllUsesWith(Phi);
1529 return;
1530 }
1531 }
1532
1533 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1534 // just the pointer operand.
1535 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1536 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1537 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1538
1539 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1540 // the first lane is demanded.
1541 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1542 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1543 Steps->replaceAllUsesWith(Steps->getOperand(0));
1544 return;
1545 }
1546 }
1547 // Simplify redundant ReductionStartVector recipes after unrolling.
1548 VPValue *StartV;
1550 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1551 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1552 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1553 return PhiR && PhiR->isInLoop();
1554 });
1555 return;
1556 }
1557
1559 Def->replaceAllUsesWith(A);
1560 return;
1561 }
1562
1563 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1566 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1567 all_of(A->users(),
1568 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1569 return Def->replaceAllUsesWith(A);
1570 }
1571
1572 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1573 return Def->replaceAllUsesWith(A);
1574}
1575
1578 Plan.getEntry());
1579 VPTypeAnalysis TypeInfo(Plan);
1581 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1582 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1583 simplifyRecipe(Def, TypeInfo);
1584 }
1585}
1586
1588 if (Plan.hasScalarVFOnly())
1589 return;
1590
1591 // Try to narrow wide and replicating recipes to single scalar recipes,
1592 // based on VPlan analysis. Only process blocks in the loop region for now,
1593 // without traversing into nested regions, as recipes in replicate regions
1594 // cannot be converted yet.
1597 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1599 VPWidenStoreRecipe>(&R))
1600 continue;
1601 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1602 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1603 continue;
1604
1605 // Convert an unmasked scatter with a uniform address into
1606 // extract-last-lane + scalar store.
1607 // TODO: Add a profitability check comparing the cost of a scatter vs.
1608 // extract + scalar store.
1609 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1610 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1611 !WidenStoreR->isConsecutive()) {
1612 assert(!WidenStoreR->isReverse() &&
1613 "Not consecutive memory recipes shouldn't be reversed");
1614 VPValue *Mask = WidenStoreR->getMask();
1615
1616 // Only convert the scatter to a scalar store if it is unmasked.
1617 // TODO: Support converting scatter masked by the header mask to scalar
1618 // store.
1619 if (Mask)
1620 continue;
1621
1623 {WidenStoreR->getOperand(1)});
1624 Extract->insertBefore(WidenStoreR);
1625
1626 // TODO: Sink the scalar store recipe to middle block if possible.
1627 auto *ScalarStore = new VPReplicateRecipe(
1628 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1629 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1630 *WidenStoreR /*Metadata*/);
1631 ScalarStore->insertBefore(WidenStoreR);
1632 WidenStoreR->eraseFromParent();
1633 continue;
1634 }
1635
1636 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1637 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1638 vputils::isSingleScalar(RepR->getOperand(1))) {
1639 auto *Clone = new VPReplicateRecipe(
1640 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1641 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1642 *RepR /*Metadata*/, RepR->getDebugLoc());
1643 Clone->insertBefore(RepOrWidenR);
1644 VPBuilder Builder(Clone);
1645 VPValue *ExtractOp = Clone->getOperand(0);
1646 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1647 ExtractOp =
1648 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1649 ExtractOp =
1650 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1651 Clone->setOperand(0, ExtractOp);
1652 RepR->eraseFromParent();
1653 continue;
1654 }
1655
1656 // Skip recipes that aren't single scalars.
1657 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1658 continue;
1659
1660      // Skip recipes for which conversion to single-scalar would introduce
1661      // additional broadcasts. No extra broadcasts are needed if either only
1662      // the scalars of the recipe are used, or at least one of the operands
1663      // would require a broadcast anyway. In the latter case, the single
1664      // scalar may need to be broadcast, but another broadcast is removed.
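      // A hedged illustration with hypothetical values: for a single-scalar
      //   %r = add %uniform, %inv
      // where %inv is a non-constant live-in used only here, keeping %r wide
      // would force a broadcast of %inv; converting %r to a single scalar
      // avoids that broadcast, at the cost of at most one broadcast of %r.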
1665 if (!all_of(RepOrWidenR->users(),
1666 [RepOrWidenR](const VPUser *U) {
1667 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1668 unsigned Opcode = VPI->getOpcode();
1669 if (Opcode == VPInstruction::ExtractLastLane ||
1670 Opcode == VPInstruction::ExtractLastPart ||
1671 Opcode == VPInstruction::ExtractPenultimateElement)
1672 return true;
1673 }
1674
1675 return U->usesScalars(RepOrWidenR);
1676 }) &&
1677 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1678 if (Op->getSingleUser() != RepOrWidenR)
1679 return false;
1680 // Non-constant live-ins require broadcasts, while constants do not
1681 // need explicit broadcasts.
1682 auto *IRV = dyn_cast<VPIRValue>(Op);
1683 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1684 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1685 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1686 }))
1687 continue;
1688
1689 auto *Clone = new VPReplicateRecipe(
1690 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1691 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1692 Clone->insertBefore(RepOrWidenR);
1693 RepOrWidenR->replaceAllUsesWith(Clone);
1694 if (isDeadRecipe(*RepOrWidenR))
1695 RepOrWidenR->eraseFromParent();
1696 }
1697 }
1698}
1699
1700/// Check whether all of \p Blend's masks share a common, logically and'ed
1701/// value and, if so, remove that value from the masks.
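/// A hedged sketch with hypothetical masks: if every edge mask has the form
///   mask(i) = logical-and %common, %m(i)
/// then, for all lanes where the blend's result is actually used, the choice
/// among the incoming values is determined by the %m(i) alone, so %common can
/// be stripped and each mask(i) replaced by %m(i).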
1703 if (Blend->isNormalized())
1704 return;
1705 VPValue *CommonEdgeMask;
1706 if (!match(Blend->getMask(0),
1707 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1708 return;
1709 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1710 if (!match(Blend->getMask(I),
1711 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1712 return;
1713 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1714 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1715}
1716
1717/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1718/// to make sure the masks are simplified.
1719static void simplifyBlends(VPlan &Plan) {
1722 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1723 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1724 if (!Blend)
1725 continue;
1726
1727 removeCommonBlendMask(Blend);
1728
1729 // Try to remove redundant blend recipes.
1730 SmallPtrSet<VPValue *, 4> UniqueValues;
1731 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1732 UniqueValues.insert(Blend->getIncomingValue(0));
1733 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1734 if (!match(Blend->getMask(I), m_False()))
1735 UniqueValues.insert(Blend->getIncomingValue(I));
1736
1737 if (UniqueValues.size() == 1) {
1738 Blend->replaceAllUsesWith(*UniqueValues.begin());
1739 Blend->eraseFromParent();
1740 continue;
1741 }
1742
1743 if (Blend->isNormalized())
1744 continue;
1745
1746 // Normalize the blend so its first incoming value is used as the initial
1747 // value with the others blended into it.
1748
1749 unsigned StartIndex = 0;
1750 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1751        // If a value's mask is used only by the blend then it can be dead-coded.
1752        // TODO: Find the most expensive mask that can be dead-coded, or a mask
1753        // that's used by multiple blends where it can be removed from them all.
1754 VPValue *Mask = Blend->getMask(I);
1755 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1756 StartIndex = I;
1757 break;
1758 }
1759 }
1760
1761 SmallVector<VPValue *, 4> OperandsWithMask;
1762 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1763
1764 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1765 if (I == StartIndex)
1766 continue;
1767 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1768 OperandsWithMask.push_back(Blend->getMask(I));
1769 }
1770
1771 auto *NewBlend =
1772 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1773 OperandsWithMask, Blend->getDebugLoc());
1774 NewBlend->insertBefore(&R);
1775
1776 VPValue *DeadMask = Blend->getMask(StartIndex);
1777 Blend->replaceAllUsesWith(NewBlend);
1778 Blend->eraseFromParent();
1780
1781 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1782 VPValue *NewMask;
1783 if (NewBlend->getNumOperands() == 3 &&
1784 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1785 VPValue *Inc0 = NewBlend->getOperand(0);
1786 VPValue *Inc1 = NewBlend->getOperand(1);
1787 VPValue *OldMask = NewBlend->getOperand(2);
1788 NewBlend->setOperand(0, Inc1);
1789 NewBlend->setOperand(1, Inc0);
1790 NewBlend->setOperand(2, NewMask);
1791 if (OldMask->getNumUsers() == 0)
1792 cast<VPInstruction>(OldMask)->eraseFromParent();
1793 }
1794 }
1795 }
1796}
1797
1798/// Optimize the width of vector induction variables in \p Plan based on a
1799/// known constant trip count, \p BestVF, and \p BestUF.
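/// For illustration (hypothetical numbers, not from the source): with a known
/// trip count of 100, VF = 4 and UF = 2, the trip count aligned up to
/// VF * UF = 8 is 104, and 103 fits in 8 bits, so a canonical i64 induction
/// whose only user is the header-mask compare can be narrowed to i8, with the
/// backedge-taken count truncated to match.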
1801 ElementCount BestVF,
1802 unsigned BestUF) {
1803 // Only proceed if we have not completely removed the vector region.
1804 if (!Plan.getVectorLoopRegion())
1805 return false;
1806
1807 const APInt *TC;
1808 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1809 return false;
1810
1811 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1812 // and UF. Returns at least 8.
1813 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1814 APInt AlignedTC =
1817 APInt MaxVal = AlignedTC - 1;
1818 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1819 };
1820 unsigned NewBitWidth =
1821 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1822
1823 LLVMContext &Ctx = Plan.getContext();
1824 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1825
1826 bool MadeChange = false;
1827
1828 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1829 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1830 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1831
1832 // Currently only handle canonical IVs as it is trivial to replace the start
1833 // and stop values, and we currently only perform the optimization when the
1834 // IV has a single use.
1835 if (!WideIV || !WideIV->isCanonical() ||
1836 WideIV->hasMoreThanOneUniqueUser() ||
1837 NewIVTy == WideIV->getScalarType())
1838 continue;
1839
1840 // Currently only handle cases where the single user is a header-mask
1841 // comparison with the backedge-taken-count.
1842 VPUser *SingleUser = WideIV->getSingleUser();
1843 if (!SingleUser ||
1844 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1847 continue;
1848
1849 // Update IV operands and comparison bound to use new narrower type.
1850 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1851 WideIV->setStartValue(NewStart);
1852 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1853 WideIV->setStepValue(NewStep);
1854
1855 auto *NewBTC = new VPWidenCastRecipe(
1856 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1857 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1858 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1859 Cmp->setOperand(1, NewBTC);
1860
1861 MadeChange = true;
1862 }
1863
1864 return MadeChange;
1865}
1866
1867/// Return true if \p Cond is known to be true for the given \p BestVF and
1868/// \p BestUF.
1870 ElementCount BestVF, unsigned BestUF,
1873 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1874 &PSE](VPValue *C) {
1875 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1876 });
1877
1878 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1880 m_Specific(CanIV->getBackedgeValue()),
1881 m_Specific(&Plan.getVectorTripCount()))))
1882 return false;
1883
1884 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1885 // count is not conveniently available as SCEV so far, so we compare directly
1886 // against the original trip count. This is stricter than necessary, as we
1887 // will only return true if the trip count == vector trip count.
1888 const SCEV *VectorTripCount =
1890 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1891 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1892 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1893 "Trip count SCEV must be computable");
1894 ScalarEvolution &SE = *PSE.getSE();
1895 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1896 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1897 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1898}
1899
1900/// Try to replace multiple active lane masks used for control flow with
1901/// a single, wide active lane mask instruction followed by multiple
1902/// extract subvector intrinsics. This applies to the active lane mask
1903/// instructions both in the loop and in the preheader.
1904/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1905/// new extracts from the first active lane mask, which has its last
1906/// operand (multiplier) set to UF.
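/// A hedged sketch for UF = 2 (names are illustrative): instead of two
/// VF-lane active.lane.mask computations per iteration, one mask of
/// VF * 2 lanes is computed and split with llvm.vector.extract at offsets 0
/// and VF; those extracts feed the two ActiveLaneMaskPHIs and the latch
/// branch.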
1908 unsigned UF) {
1909 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1910 return false;
1911
1912 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1913 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1914 auto *Term = &ExitingVPBB->back();
1915
1916 using namespace llvm::VPlanPatternMatch;
1918 m_VPValue(), m_VPValue(), m_VPValue())))))
1919 return false;
1920
1921 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1922 LLVMContext &Ctx = Plan.getContext();
1923
1924 auto ExtractFromALM = [&](VPInstruction *ALM,
1925 SmallVectorImpl<VPValue *> &Extracts) {
1926 DebugLoc DL = ALM->getDebugLoc();
1927 for (unsigned Part = 0; Part < UF; ++Part) {
1929 Ops.append({ALM, Plan.getOrAddLiveIn(
1930 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1931 VF.getKnownMinValue() * Part))});
1932 auto *Ext =
1933 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1934 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1935 Extracts[Part] = Ext;
1936 Ext->insertAfter(ALM);
1937 }
1938 };
1939
1940 // Create a list of each active lane mask phi, ordered by unroll part.
1942 for (VPRecipeBase &R : Header->phis()) {
1944 if (!Phi)
1945 continue;
1946 VPValue *Index = nullptr;
1947 match(Phi->getBackedgeValue(),
1949 assert(Index && "Expected index from ActiveLaneMask instruction");
1950
1951 uint64_t Part;
1952 if (match(Index,
1954 m_VPValue(), m_ConstantInt(Part))))
1955 Phis[Part] = Phi;
1956 else
1957 // Anything other than a CanonicalIVIncrementForPart is part 0
1958 Phis[0] = Phi;
1959 }
1960
1961 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1962 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1963
1964 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1965 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1966
1967 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1968 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1969 "Expected incoming values of Phi to be ActiveLaneMasks");
1970
1971  // When using wide lane masks, the get.active.lane.mask intrinsic returns
1972  // VF x UF elements; UF is passed as its multiplier (last operand).
1973 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1974 EntryALM->setOperand(2, ALMMultiplier);
1975 LoopALM->setOperand(2, ALMMultiplier);
1976
1977 // Create UF x extract vectors and insert into preheader.
1978 SmallVector<VPValue *> EntryExtracts(UF);
1979 ExtractFromALM(EntryALM, EntryExtracts);
1980
1981 // Create UF x extract vectors and insert before the loop compare & branch,
1982 // updating the compare to use the first extract.
1983 SmallVector<VPValue *> LoopExtracts(UF);
1984 ExtractFromALM(LoopALM, LoopExtracts);
1985 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1986 Not->setOperand(0, LoopExtracts[0]);
1987
1988 // Update the incoming values of active lane mask phis.
1989 for (unsigned Part = 0; Part < UF; ++Part) {
1990 Phis[Part]->setStartValue(EntryExtracts[Part]);
1991 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1992 }
1993
1994 return true;
1995}
1996
1997/// Try to simplify the branch condition of \p Plan. This may restrict the
1998/// resulting plan to \p BestVF and \p BestUF.
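/// As a hedged example: if the vector trip count is known to be at most
/// VF * UF, the loop body executes exactly once, so the latch no longer needs
/// a real check; when the header phis permit it the region is dissolved
/// entirely, otherwise the latch terminator is replaced with BranchOnCond
/// true.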
2000 unsigned BestUF,
2002 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2003 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2004 auto *Term = &ExitingVPBB->back();
2005 VPValue *Cond;
2006 if (match(Term, m_BranchOnCount()) ||
2008 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2009 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2010 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2011 const SCEV *VectorTripCount =
2013 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2014 VectorTripCount =
2016 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2017 "Trip count SCEV must be computable");
2018 ScalarEvolution &SE = *PSE.getSE();
2019 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2020 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2021 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2022 return false;
2023 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2025 // For BranchOnCond, check if we can prove the condition to be true using VF
2026 // and UF.
2027 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2028 return false;
2029 } else {
2030 return false;
2031 }
2032
2033 // The vector loop region only executes once. If possible, completely remove
2034 // the region, otherwise replace the terminator controlling the latch with
2035 // (BranchOnCond true).
2036 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2037 // support for other non-canonical widen induction recipes (e.g.,
2038 // VPWidenPointerInductionRecipe).
2039 // TODO: fold branch-on-constant after dissolving region.
2040 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2041 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2042 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2043 return R->isCanonical();
2044 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2045 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2046 })) {
2047 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2048 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2049 VPBuilder Builder(Plan.getVectorPreheader());
2050 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2051 R->getScalarType());
2052 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2053 HeaderR.eraseFromParent();
2054 continue;
2055 }
2056 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2057 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2058 HeaderR.eraseFromParent();
2059 }
2060
2061 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2062 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2063 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2064 for (VPBlockBase *Exit : Exits)
2065 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2066
2067 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2068 B->setParent(nullptr);
2069
2070 VPBlockUtils::connectBlocks(Preheader, Header);
2071
2072 for (VPBlockBase *Exit : Exits)
2073 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2074
2075    // Replace the terminating branch-on-two-conds with a branch-on-cond to the
2076    // early exit.
2077 if (Exits.size() != 1) {
2078 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2079 "BranchOnTwoConds needs 2 remaining exits");
2081 Term->getOperand(0));
2082 }
2084 } else {
2085 // The vector region contains header phis for which we cannot remove the
2086 // loop region yet.
2087
2088 // For BranchOnTwoConds, set the latch exit condition to true directly.
2089 if (match(Term, m_BranchOnTwoConds())) {
2090 Term->setOperand(1, Plan.getTrue());
2091 return true;
2092 }
2093
2094 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2095 {}, {}, Term->getDebugLoc());
2096 ExitingVPBB->appendRecipe(BOC);
2097 }
2098
2099 Term->eraseFromParent();
2100
2101 return true;
2102}
2103
2104/// From the definition of llvm.experimental.get.vector.length,
2105/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
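/// As a hedged example: if SCEV can prove %AVL <= VF for the whole loop (for
/// instance, a known short trip count), the explicit-vector-length recipe is
/// redundant and is replaced by %AVL itself, zero-extended or truncated to
/// i32.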
2109 vp_depth_first_deep(Plan.getEntry()))) {
2110 for (VPRecipeBase &R : *VPBB) {
2111 VPValue *AVL;
2112 if (!match(&R, m_EVL(m_VPValue(AVL))))
2113 continue;
2114
2115 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2116 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2117 continue;
2118 ScalarEvolution &SE = *PSE.getSE();
2119 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2120 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2121 continue;
2122
2124 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2125 R.getDebugLoc());
2126 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2127 return true;
2128 }
2129 }
2130 return false;
2131}
2132
2134 unsigned BestUF,
2136 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2137 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2138
2139 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2140 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2141 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2142 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2143
2144 if (MadeChange) {
2145 Plan.setVF(BestVF);
2146 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2147 }
2148}
2149
2150/// Sink users of \p FOR after the recipe defining the previous value \p
2151/// Previous of the recurrence. \returns true if all users of \p FOR could be
2152/// re-arranged as needed or false if it is not possible.
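/// A hedged sketch with hypothetical recipes: given
///   %for  = first-order-recurrence-phi %init, %prev
///   %use  = add %for, 1    ; user of the recurrence
///   %prev = ...            ; defines the recurrence's previous value
/// %use is moved after %prev so that the splice combining %for and %prev,
/// which is inserted right after %prev, still dominates all users of %for.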
2153static bool
2155 VPRecipeBase *Previous,
2156 VPDominatorTree &VPDT) {
2157 // Collect recipes that need sinking.
2160 Seen.insert(Previous);
2161 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2162    // The previous value must not depend on the users of the recurrence phi;
2163    // otherwise, FOR would not be a fixed-order recurrence.
2164 if (SinkCandidate == Previous)
2165 return false;
2166
2167 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2168 !Seen.insert(SinkCandidate).second ||
2169 VPDT.properlyDominates(Previous, SinkCandidate))
2170 return true;
2171
2172 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2173 return false;
2174
2175 WorkList.push_back(SinkCandidate);
2176 return true;
2177 };
2178
2179 // Recursively sink users of FOR after Previous.
2180 WorkList.push_back(FOR);
2181 for (unsigned I = 0; I != WorkList.size(); ++I) {
2182 VPRecipeBase *Current = WorkList[I];
2183 assert(Current->getNumDefinedValues() == 1 &&
2184 "only recipes with a single defined value expected");
2185
2186 for (VPUser *User : Current->getVPSingleValue()->users()) {
2187 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2188 return false;
2189 }
2190 }
2191
2192 // Keep recipes to sink ordered by dominance so earlier instructions are
2193 // processed first.
2194 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2195 return VPDT.properlyDominates(A, B);
2196 });
2197
2198 for (VPRecipeBase *SinkCandidate : WorkList) {
2199 if (SinkCandidate == FOR)
2200 continue;
2201
2202 SinkCandidate->moveAfter(Previous);
2203 Previous = SinkCandidate;
2204 }
2205 return true;
2206}
2207
2208/// Try to hoist \p Previous and its operands before all users of \p FOR.
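/// A hedged sketch with hypothetical recipes: when sinking fails because a
/// user of %for cannot be moved, e.g.
///   %use  = store %for, %addr   ; cannot be sunk past %prev
///   %prev = mul %a, %b
/// %prev (and, transitively, any in-loop recipes it depends on) is instead
/// moved up before the earliest user of %for, provided none of the moved
/// recipes has side effects and each defines a single value.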
2210 VPRecipeBase *Previous,
2211 VPDominatorTree &VPDT) {
2212 if (cannotHoistOrSinkRecipe(*Previous))
2213 return false;
2214
2215 // Collect recipes that need hoisting.
2216 SmallVector<VPRecipeBase *> HoistCandidates;
2218 VPRecipeBase *HoistPoint = nullptr;
2219 // Find the closest hoist point by looking at all users of FOR and selecting
2220 // the recipe dominating all other users.
2221 for (VPUser *U : FOR->users()) {
2222 auto *R = cast<VPRecipeBase>(U);
2223 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2224 HoistPoint = R;
2225 }
2226 assert(all_of(FOR->users(),
2227 [&VPDT, HoistPoint](VPUser *U) {
2228 auto *R = cast<VPRecipeBase>(U);
2229 return HoistPoint == R ||
2230 VPDT.properlyDominates(HoistPoint, R);
2231 }) &&
2232 "HoistPoint must dominate all users of FOR");
2233
2234 auto NeedsHoisting = [HoistPoint, &VPDT,
2235 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2236 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2237 if (!HoistCandidate)
2238 return nullptr;
2239 VPRegionBlock *EnclosingLoopRegion =
2240 HoistCandidate->getParent()->getEnclosingLoopRegion();
2241 assert((!HoistCandidate->getRegion() ||
2242 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2243 "CFG in VPlan should still be flat, without replicate regions");
2244 // Hoist candidate was already visited, no need to hoist.
2245 if (!Visited.insert(HoistCandidate).second)
2246 return nullptr;
2247
2248    // Candidate is outside the loop region or is a header phi; it dominates
2249    // the users of FOR without needing to be hoisted.
2250 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2251 return nullptr;
2252
2253 // If we reached a recipe that dominates HoistPoint, we don't need to
2254 // hoist the recipe.
2255 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2256 return nullptr;
2257 return HoistCandidate;
2258 };
2259
2260 if (!NeedsHoisting(Previous->getVPSingleValue()))
2261 return true;
2262
2263 // Recursively try to hoist Previous and its operands before all users of FOR.
2264 HoistCandidates.push_back(Previous);
2265
2266 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2267 VPRecipeBase *Current = HoistCandidates[I];
2268 assert(Current->getNumDefinedValues() == 1 &&
2269 "only recipes with a single defined value expected");
2270 if (cannotHoistOrSinkRecipe(*Current))
2271 return false;
2272
2273 for (VPValue *Op : Current->operands()) {
2274 // If we reach FOR, it means the original Previous depends on some other
2275 // recurrence that in turn depends on FOR. If that is the case, we would
2276 // also need to hoist recipes involving the other FOR, which may break
2277 // dependencies.
2278 if (Op == FOR)
2279 return false;
2280
2281 if (auto *R = NeedsHoisting(Op)) {
2282 // Bail out if the recipe defines multiple values.
2283 // TODO: Hoisting such recipes requires additional handling.
2284 if (R->getNumDefinedValues() != 1)
2285 return false;
2286 HoistCandidates.push_back(R);
2287 }
2288 }
2289 }
2290
2291 // Order recipes to hoist by dominance so earlier instructions are processed
2292 // first.
2293 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2294 return VPDT.properlyDominates(A, B);
2295 });
2296
2297 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2298 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2299 HoistPoint->getIterator());
2300 }
2301
2302 return true;
2303}
2304
2306 VPBuilder &LoopBuilder) {
2307 VPDominatorTree VPDT(Plan);
2308
2310 for (VPRecipeBase &R :
2313 RecurrencePhis.push_back(FOR);
2314
2315 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2317 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2318 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2319 // to terminate.
2320 while (auto *PrevPhi =
2322 assert(PrevPhi->getParent() == FOR->getParent());
2323 assert(SeenPhis.insert(PrevPhi).second);
2324 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2325 }
2326
2327 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2328 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2329 return false;
2330
2331 // Introduce a recipe to combine the incoming and previous values of a
2332 // fixed-order recurrence.
2333 VPBasicBlock *InsertBlock = Previous->getParent();
2334 if (isa<VPHeaderPHIRecipe>(Previous))
2335 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2336 else
2337 LoopBuilder.setInsertPoint(InsertBlock,
2338 std::next(Previous->getIterator()));
2339
2340 auto *RecurSplice =
2342 {FOR, FOR->getBackedgeValue()});
2343
2344 FOR->replaceAllUsesWith(RecurSplice);
2345 // Set the first operand of RecurSplice to FOR again, after replacing
2346 // all users.
2347 RecurSplice->setOperand(0, FOR);
2348
2349 // Check for users extracting at the penultimate active lane of the FOR.
2350 // If only a single lane is active in the current iteration, we need to
2351 // select the last element from the previous iteration (from the FOR phi
2352 // directly).
2353 for (VPUser *U : RecurSplice->users()) {
2355 m_Specific(RecurSplice))))
2356 continue;
2357
2359 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2360 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2361 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2362 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2363 VPValue *PenultimateIndex =
2364 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2365 VPValue *PenultimateLastIter =
2366 B.createNaryOp(VPInstruction::ExtractLane,
2367 {PenultimateIndex, FOR->getBackedgeValue()});
2368 VPValue *LastPrevIter =
2369 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2370
2371 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2372 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2373 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2374 }
2375 }
2376 return true;
2377}
2378
2380 for (VPRecipeBase &R :
2382 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2383 if (!PhiR)
2384 continue;
2385 RecurKind RK = PhiR->getRecurrenceKind();
2386 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2388 continue;
2389
2390 for (VPUser *U : collectUsersRecursively(PhiR))
2391 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2392 RecWithFlags->dropPoisonGeneratingFlags();
2393 }
2394 }
2395}
2396
2397namespace {
2398struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2399 static bool isSentinel(const VPSingleDefRecipe *Def) {
2400 return Def == getEmptyKey() || Def == getTombstoneKey();
2401 }
2402
2403 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2404 /// return that source element type.
2405 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2406 // All VPInstructions that lower to GEPs must have the i8 source element
2407 // type (as they are PtrAdds), so we omit it.
2409 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2410 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2411 return GEP->getSourceElementType();
2412 return nullptr;
2413 })
2414 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2415 [](auto *I) { return I->getSourceElementType(); })
2416 .Default([](auto *) { return nullptr; });
2417 }
2418
2419  /// Returns true if recipe \p Def can be safely handled for CSE.
2420 static bool canHandle(const VPSingleDefRecipe *Def) {
2421 // We can extend the list of handled recipes in the future,
2422 // provided we account for the data embedded in them while checking for
2423 // equality or hashing.
2424 auto C = getOpcodeOrIntrinsicID(Def);
2425
2426 // The issue with (Insert|Extract)Value is that the index of the
2427 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2428 // VPlan.
2429 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2430 C->second == Instruction::ExtractValue)))
2431 return false;
2432
2433 // During CSE, we can only handle recipes that don't read from memory: if
2434 // they read from memory, there could be an intervening write to memory
2435 // before the next instance is CSE'd, leading to an incorrect result.
2436 return !Def->mayReadFromMemory();
2437 }
2438
2439 /// Hash the underlying data of \p Def.
2440 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2441 const VPlan *Plan = Def->getParent()->getPlan();
2442 VPTypeAnalysis TypeInfo(*Plan);
2443 hash_code Result = hash_combine(
2444 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2445 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2447 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2448 if (RFlags->hasPredicate())
2449 return hash_combine(Result, RFlags->getPredicate());
2450 return Result;
2451 }
2452
2453 /// Check equality of underlying data of \p L and \p R.
2454 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2455 if (isSentinel(L) || isSentinel(R))
2456 return L == R;
2457 if (L->getVPDefID() != R->getVPDefID() ||
2459 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2461 !equal(L->operands(), R->operands()))
2462 return false;
2464 "must have valid opcode info for both recipes");
2465 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2466 if (LFlags->hasPredicate() &&
2467 LFlags->getPredicate() !=
2468 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2469 return false;
2470    // Recipes in replicate regions implicitly depend on their predicate. If either
2471 // recipe is in a replicate region, only consider them equal if both have
2472 // the same parent.
2473 const VPRegionBlock *RegionL = L->getRegion();
2474 const VPRegionBlock *RegionR = R->getRegion();
2475 if (((RegionL && RegionL->isReplicator()) ||
2476 (RegionR && RegionR->isReplicator())) &&
2477 L->getParent() != R->getParent())
2478 return false;
2479 const VPlan *Plan = L->getParent()->getPlan();
2480 VPTypeAnalysis TypeInfo(*Plan);
2481 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2482 }
2483};
2484} // end anonymous namespace
2485
2486/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2487/// Plan.
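/// A hedged sketch with hypothetical recipes: two occurrences of
///   %p = ptradd %base, %off
/// with the same opcode, operands, predicate and inferred type are collapsed
/// onto the dominating copy, and flags not present on both are dropped via
/// intersectFlags.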
2489 VPDominatorTree VPDT(Plan);
2491
2493 vp_depth_first_deep(Plan.getEntry()))) {
2494 for (VPRecipeBase &R : *VPBB) {
2495 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2496 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2497 continue;
2498 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2499 // V must dominate Def for a valid replacement.
2500 if (!VPDT.dominates(V->getParent(), VPBB))
2501 continue;
2502 // Only keep flags present on both V and Def.
2503 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2504 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2505 Def->replaceAllUsesWith(V);
2506 continue;
2507 }
2508 CSEMap[Def] = Def;
2509 }
2510 }
2511}
2512
2513/// Move loop-invariant recipes out of the vector loop region in \p Plan.
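/// A hedged example: a recipe such as
///   %inv = mul %live-in.a, %live-in.b
/// whose operands are all defined outside the loop regions is moved from the
/// loop body into the vector preheader; the shallow traversal deliberately
/// leaves recipes inside replicate regions where they are.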
2514static void licm(VPlan &Plan) {
2515 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2516
2517  // Hoist any loop-invariant recipes from the vector loop region to the
2518  // preheader. Perform a shallow traversal of the vector loop region to
2519 // exclude recipes in replicate regions. Since the top-level blocks in the
2520 // vector loop region are guaranteed to execute if the vector pre-header is,
2521 // we don't need to check speculation safety.
2522 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2523 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2524         "Expected vector preheader's successor to be the vector loop region");
2526 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2527 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2529 continue;
2530 if (any_of(R.operands(), [](VPValue *Op) {
2531 return !Op->isDefinedOutsideLoopRegions();
2532 }))
2533 continue;
2534 R.moveBefore(*Preheader, Preheader->end());
2535 }
2536 }
2537}
2538
2540 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2541 if (Plan.hasScalarVFOnly())
2542 return;
2543 // Keep track of created truncates, so they can be re-used. Note that we
2544  // cannot use RAUW after creating a new truncate, as this could make
2545 // other uses have different types for their operands, making them invalidly
2546 // typed.
2548 VPTypeAnalysis TypeInfo(Plan);
2549 VPBasicBlock *PH = Plan.getVectorPreheader();
2552 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2555 continue;
2556
2557 VPValue *ResultVPV = R.getVPSingleValue();
2558 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2559 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2560 if (!NewResSizeInBits)
2561 continue;
2562
2563 // If the value wasn't vectorized, we must maintain the original scalar
2564 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2565 // skip casts which do not need to be handled explicitly here, as
2566 // redundant casts will be removed during recipe simplification.
2568 continue;
2569
2570 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2571 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2572 assert(OldResTy->isIntegerTy() && "only integer types supported");
2573 (void)OldResSizeInBits;
2574
2575 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2576
2577 // Any wrapping introduced by shrinking this operation shouldn't be
2578 // considered undefined behavior. So, we can't unconditionally copy
2579 // arithmetic wrapping flags to VPW.
2580 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2581 VPW->dropPoisonGeneratingFlags();
2582
2583 if (OldResSizeInBits != NewResSizeInBits &&
2584 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2585 // Extend result to original width.
2586 auto *Ext =
2587 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2588 Ext->insertAfter(&R);
2589 ResultVPV->replaceAllUsesWith(Ext);
2590 Ext->setOperand(0, ResultVPV);
2591 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2592 } else {
2593 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2594 "Only ICmps should not need extending the result.");
2595 }
2596
2597 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2599 continue;
2600
2601 // Shrink operands by introducing truncates as needed.
2602 unsigned StartIdx =
2603 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2604 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2605 auto *Op = R.getOperand(Idx);
2606 unsigned OpSizeInBits =
2608 if (OpSizeInBits == NewResSizeInBits)
2609 continue;
2610 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2611 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2612 if (!IterIsEmpty) {
2613 R.setOperand(Idx, ProcessedIter->second);
2614 continue;
2615 }
2616
2617 VPBuilder Builder;
2618 if (isa<VPIRValue>(Op))
2619 Builder.setInsertPoint(PH);
2620 else
2621 Builder.setInsertPoint(&R);
2622 VPWidenCastRecipe *NewOp =
2623 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2624 ProcessedIter->second = NewOp;
2625 R.setOperand(Idx, NewOp);
2626 }
2627
2628 }
2629 }
2630}
2631
2635 VPValue *Cond;
2636 // Skip blocks that are not terminated by BranchOnCond.
2637 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2638 continue;
2639
2640 assert(VPBB->getNumSuccessors() == 2 &&
2641 "Two successors expected for BranchOnCond");
2642 unsigned RemovedIdx;
2643 if (match(Cond, m_True()))
2644 RemovedIdx = 1;
2645 else if (match(Cond, m_False()))
2646 RemovedIdx = 0;
2647 else
2648 continue;
2649
2650 VPBasicBlock *RemovedSucc =
2651 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2652 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2653 "There must be a single edge between VPBB and its successor");
2654    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2655 // these recipes.
2656 for (VPRecipeBase &R : RemovedSucc->phis())
2657 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2658
2659 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2660 // automatically on VPlan destruction if it becomes unreachable.
2661 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2662 VPBB->back().eraseFromParent();
2663 }
2664}
2665
2685
2686// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2687// the loop terminator with a branch-on-cond recipe with the negated
2688// active-lane-mask as operand. Note that this turns the loop into an
2689// uncountable one. Only the existing terminator is replaced; all other existing
2690// recipes/users remain unchanged, except for poison-generating flags being
2691// dropped from the canonical IV increment. Return the created
2692// VPActiveLaneMaskPHIRecipe.
2693//
2694// The function uses the following definitions:
2695//
2696// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2697// calculate-trip-count-minus-VF (original TC) : original TC
2698// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2699// CanonicalIVPhi : CanonicalIVIncrement
2700// %StartV is the canonical induction start value.
2701//
2702// The function adds the following recipes:
2703//
2704// vector.ph:
2705// %TripCount = calculate-trip-count-minus-VF (original TC)
2706// [if DataWithControlFlowWithoutRuntimeCheck]
2707// %EntryInc = canonical-iv-increment-for-part %StartV
2708// %EntryALM = active-lane-mask %EntryInc, %TripCount
2709//
2710// vector.body:
2711// ...
2712// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2713// ...
2714// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2715// %ALM = active-lane-mask %InLoopInc, TripCount
2716// %Negated = Not %ALM
2717// branch-on-cond %Negated
2718//
2721 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2722 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2723 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2724 VPValue *StartV = CanonicalIVPHI->getStartValue();
2725
2726 auto *CanonicalIVIncrement =
2727 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2728 // TODO: Check if dropping the flags is needed if
2729 // !DataAndControlFlowWithoutRuntimeCheck.
2730 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2731 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2732 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2733 // we have to take unrolling into account. Each part needs to start at
2734 // Part * VF
2735 auto *VecPreheader = Plan.getVectorPreheader();
2736 VPBuilder Builder(VecPreheader);
2737
2738 // Create the ActiveLaneMask instruction using the correct start values.
2739 VPValue *TC = Plan.getTripCount();
2740
2741 VPValue *TripCount, *IncrementValue;
2743 // When the loop is guarded by a runtime overflow check for the loop
2744 // induction variable increment by VF, we can increment the value before
2745    // the get.active.lane.mask and use the unmodified trip count.
2746 IncrementValue = CanonicalIVIncrement;
2747 TripCount = TC;
2748 } else {
2749 // When avoiding a runtime check, the active.lane.mask inside the loop
2750 // uses a modified trip count and the induction variable increment is
2751 // done after the active.lane.mask intrinsic is called.
2752 IncrementValue = CanonicalIVPHI;
2753 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2754 {TC}, DL);
2755 }
2756 auto *EntryIncrement = Builder.createOverflowingOp(
2757 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2758 "index.part.next");
2759
2760 // Create the active lane mask instruction in the VPlan preheader.
2761 VPValue *ALMMultiplier =
2762 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2763 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2764 {EntryIncrement, TC, ALMMultiplier}, DL,
2765 "active.lane.mask.entry");
2766
2767 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2768 // preheader ActiveLaneMask instruction.
2769 auto *LaneMaskPhi =
2771 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2772
2773 // Create the active lane mask for the next iteration of the loop before the
2774 // original terminator.
2775 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2776 Builder.setInsertPoint(OriginalTerminator);
2777 auto *InLoopIncrement =
2778 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2779 {IncrementValue}, {false, false}, DL);
2780 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2781 {InLoopIncrement, TripCount, ALMMultiplier},
2782 DL, "active.lane.mask.next");
2783 LaneMaskPhi->addOperand(ALM);
2784
2785 // Replace the original terminator with BranchOnCond. We have to invert the
2786 // mask here because a true condition means jumping to the exit block.
2787 auto *NotMask = Builder.createNot(ALM, DL);
2788 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2789 OriginalTerminator->eraseFromParent();
2790 return LaneMaskPhi;
2791}
2792
2793/// Collect the header mask with the pattern:
2794/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2795/// TODO: Introduce explicit recipe for header-mask instead of searching
2796/// for the header-mask pattern manually.
2798 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2799 SmallVector<VPValue *> WideCanonicalIVs;
2800 auto *FoundWidenCanonicalIVUser = find_if(
2802 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2804 "Must have at most one VPWideCanonicalIVRecipe");
2805 if (FoundWidenCanonicalIVUser !=
2806 LoopRegion->getCanonicalIV()->users().end()) {
2807 auto *WideCanonicalIV =
2808 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2809 WideCanonicalIVs.push_back(WideCanonicalIV);
2810 }
2811
2812 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2813 // version of the canonical induction.
2814 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2815 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2816 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2817 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2818 WideCanonicalIVs.push_back(WidenOriginalIV);
2819 }
2820
2821 // Walk users of wide canonical IVs and find the single compare of the form
2822 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2823 VPSingleDefRecipe *HeaderMask = nullptr;
2824 for (auto *Wide : WideCanonicalIVs) {
2825 for (VPUser *U : Wide->users()) {
2826 auto *VPI = dyn_cast<VPInstruction>(U);
2827 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2828 continue;
2829
2830 assert(VPI->getOperand(0) == Wide &&
2831 "WidenCanonicalIV must be the first operand of the compare");
2832 assert(!HeaderMask && "Multiple header masks found?");
2833 HeaderMask = VPI;
2834 }
2835 }
2836 return HeaderMask;
2837}
2838
2840 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2843 UseActiveLaneMaskForControlFlow) &&
2844 "DataAndControlFlowWithoutRuntimeCheck implies "
2845 "UseActiveLaneMaskForControlFlow");
2846
2847 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2848 auto *FoundWidenCanonicalIVUser = find_if(
2850 assert(FoundWidenCanonicalIVUser &&
2851 "Must have widened canonical IV when tail folding!");
2852 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2853 auto *WideCanonicalIV =
2854 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2855 VPSingleDefRecipe *LaneMask;
2856 if (UseActiveLaneMaskForControlFlow) {
2859 } else {
2860 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2861 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2862 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2863 LaneMask =
2864 B.createNaryOp(VPInstruction::ActiveLaneMask,
2865 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2866 nullptr, "active.lane.mask");
2867 }
2868
2869 // Walk users of WideCanonicalIV and replace the header mask of the form
2870 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2871 // removing the old one to ensure there is always only a single header mask.
2872 HeaderMask->replaceAllUsesWith(LaneMask);
2873 HeaderMask->eraseFromParent();
2874}
2875
2876template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2877 Op0_t In;
2879
2880 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2881
2882 template <typename OpTy> bool match(OpTy *V) const {
2883 if (m_Specific(In).match(V)) {
2884 Out = nullptr;
2885 return true;
2886 }
2887 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2888 }
2889};
2890
2891/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2892/// Returns the remaining part \p Out if so, or nullptr otherwise.
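/// For example, with \p In being the header mask: matching the header mask
/// itself sets \p Out to nullptr, while matching
/// (logical-and header-mask, %m) sets \p Out to %m; anything else fails to
/// match.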
2893template <typename Op0_t, typename Op1_t>
2894static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2895 Op1_t &Out) {
2896 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2897}
2898
2899/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2900/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2901/// recipe could be created.
2902/// \p HeaderMask Header Mask.
2903/// \p CurRecipe Recipe to be transformed.
2904/// \p TypeInfo VPlan-based type analysis.
2905/// \p EVL The explicit vector length parameter of vector-predication
2906/// intrinsics.
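/// A hedged example: a non-reversed widened load masked only by the header
/// mask,
///   %v = load %addr, mask = %header-mask
/// becomes a VPWidenLoadEVLRecipe taking \p EVL with its mask dropped
/// (nullptr), while a load masked by (logical-and %header-mask, %m) keeps %m
/// as the mask of the EVL recipe.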
2908 VPRecipeBase &CurRecipe,
2909 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2910 VPlan *Plan = CurRecipe.getParent()->getPlan();
2911 DebugLoc DL = CurRecipe.getDebugLoc();
2912 VPValue *Addr, *Mask, *EndPtr;
2913
2914  /// Adjust any end pointers so that they point to the end of EVL lanes, not VF lanes.
2915 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2916 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2917 EVLEndPtr->insertBefore(&CurRecipe);
2918 EVLEndPtr->setOperand(1, &EVL);
2919 return EVLEndPtr;
2920 };
2921
2922 if (match(&CurRecipe,
2923 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2924 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2925 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2926 EVL, Mask);
2927
2928 VPValue *ReversedVal;
2929 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2930 match(ReversedVal,
2931 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2932 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2933 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2934 auto *LoadR = new VPWidenLoadEVLRecipe(
2935 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2936 LoadR->insertBefore(&CurRecipe);
2937 return new VPWidenIntrinsicRecipe(
2938 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2939 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2940 }
2941
2942 VPValue *StoredVal;
2943 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2944 m_RemoveMask(HeaderMask, Mask))) &&
2945 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2946 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2947 StoredVal, EVL, Mask);
2948
2949 if (match(&CurRecipe,
2950 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2951 m_RemoveMask(HeaderMask, Mask))) &&
2952 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2953 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2954 auto *NewReverse = new VPWidenIntrinsicRecipe(
2955 Intrinsic::experimental_vp_reverse,
2956 {ReversedVal, Plan->getTrue(), &EVL},
2957 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2958 NewReverse->insertBefore(&CurRecipe);
2959 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2960 AdjustEndPtr(EndPtr), NewReverse, EVL,
2961 Mask);
2962 }
2963
2964 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2965 if (Rdx->isConditional() &&
2966 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2967 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2968
2969 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2970 if (Interleave->getMask() &&
2971 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2972 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2973
2974 VPValue *LHS, *RHS;
2975 if (match(&CurRecipe,
2976 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2977 return new VPWidenIntrinsicRecipe(
2978 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2979 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2980
2981 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2982 m_VPValue(RHS))))
2983 return new VPWidenIntrinsicRecipe(
2984 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2985 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2986
2987 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2988 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2989 VPValue *ZExt =
2990 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2991 return new VPInstruction(Instruction::Sub,
2992 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2993 }
2994
2995 return nullptr;
2996}
2997
2998/// Replace recipes with their EVL variants.
3000 VPTypeAnalysis TypeInfo(Plan);
3001 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3002 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3003
3004 assert(all_of(Plan.getVF().users(),
3007 "User of VF that we can't transform to EVL.");
3008 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3010 });
3011
3012 assert(all_of(Plan.getVFxUF().users(),
3013 [&LoopRegion, &Plan](VPUser *U) {
3014 return match(U,
3015 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3016 m_Specific(&Plan.getVFxUF()))) ||
3017 isa<VPWidenPointerInductionRecipe>(U);
3018 }) &&
3019 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3020 "increment of the canonical induction.");
3021 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3022 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
3023 // canonical induction must not be updated.
3025 });
3026
3027 // Defer erasing recipes till the end so that we don't invalidate the
3028 // VPTypeAnalysis cache.
3030
3031  // Create a scalar phi to track the previous EVL if the plan contains a
3032  // fixed-order recurrence.
3033 bool ContainsFORs =
3035 if (ContainsFORs) {
3036 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3037 VPValue *MaxEVL = &Plan.getVF();
3038    // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
3039 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3040 MaxEVL = Builder.createScalarZExtOrTrunc(
3041 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3042 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3043
3044 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3045 VPValue *PrevEVL = Builder.createScalarPhi(
3046 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3047
3050 for (VPRecipeBase &R : *VPBB) {
3051 VPValue *V1, *V2;
3052 if (!match(&R,
3054 m_VPValue(V1), m_VPValue(V2))))
3055 continue;
3056 VPValue *Imm = Plan.getOrAddLiveIn(
3059 Intrinsic::experimental_vp_splice,
3060 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3061 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3062 R.getDebugLoc());
3063 VPSplice->insertBefore(&R);
3064 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3065 ToErase.push_back(&R);
3066 }
3067 }
3068 }
3069
3070 VPValue *HeaderMask = findHeaderMask(Plan);
3071 if (!HeaderMask)
3072 return;
3073
3074 // Replace header masks with a mask equivalent to predicating by EVL:
3075 //
3076 // icmp ule widen-canonical-iv backedge-taken-count
3077 // ->
3078 // icmp ult step-vector, EVL
3079 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3080 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3081 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3082 VPValue *EVLMask = Builder.createICmp(
3084 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3085 HeaderMask->replaceAllUsesWith(EVLMask);
3086 ToErase.push_back(HeaderMask->getDefiningRecipe());
3087
3088 // Try to optimize header mask recipes away to their EVL variants.
3089 // TODO: Split optimizeMaskToEVL out and move into
3090 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
3091 // tryToBuildVPlanWithVPRecipes beforehand.
3092 for (VPUser *U : collectUsersRecursively(EVLMask)) {
3093 auto *CurRecipe = cast<VPRecipeBase>(U);
3094 VPRecipeBase *EVLRecipe =
3095 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
3096 if (!EVLRecipe)
3097 continue;
3098
3099 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
3100 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
3101 "New recipe must define the same number of values as the "
3102 "original.");
3103 EVLRecipe->insertBefore(CurRecipe);
3105 EVLRecipe)) {
3106 for (unsigned I = 0; I < NumDefVal; ++I) {
3107 VPValue *CurVPV = CurRecipe->getVPValue(I);
3108 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
3109 }
3110 }
3111 ToErase.push_back(CurRecipe);
3112 }
3113 // Remove dead EVL mask.
3114 if (EVLMask->getNumUsers() == 0)
3115 ToErase.push_back(EVLMask->getDefiningRecipe());
3116
3117 for (VPRecipeBase *R : reverse(ToErase)) {
3118 SmallVector<VPValue *> PossiblyDead(R->operands());
3119 R->eraseFromParent();
3120 for (VPValue *Op : PossiblyDead)
3122 }
3123}
3124
3125/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3126/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
3127/// increment, with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe is used
3128/// only for counting loop iterations after this transformation.
3129///
3130/// The function uses the following definitions:
3131/// %StartV is the canonical induction start value.
3132///
3133/// The function adds the following recipes:
3134///
3135/// vector.ph:
3136/// ...
3137///
3138/// vector.body:
3139/// ...
3140/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3141/// [ %NextEVLIV, %vector.body ]
3142/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3143/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3144/// ...
3145/// %OpEVL = cast i32 %VPEVL to IVSize
3146/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3147/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3148/// ...
3149///
3150/// If MaxSafeElements is provided, the function adds the following recipes:
3151/// vector.ph:
3152/// ...
3153///
3154/// vector.body:
3155/// ...
3156/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3157/// [ %NextEVLIV, %vector.body ]
3158/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3159/// %cmp = cmp ult %AVL, MaxSafeElements
3160/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3161/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3162/// ...
3163/// %OpEVL = cast i32 %VPEVL to IVSize
3164/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3165/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3166/// ...
3167///
3169 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3170 if (Plan.hasScalarVFOnly())
3171 return;
3172 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3173 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3174
3175 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3176 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3177 VPValue *StartV = CanonicalIVPHI->getStartValue();
3178
3179 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3180 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3181 EVLPhi->insertAfter(CanonicalIVPHI);
3182 VPBuilder Builder(Header, Header->getFirstNonPhi());
3183 // Create the AVL (application vector length), starting from TC -> 0 in steps
3184 // of EVL.
3185 VPPhi *AVLPhi = Builder.createScalarPhi(
3186 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3187 VPValue *AVL = AVLPhi;
3188
3189 if (MaxSafeElements) {
3190    // Clamp the AVL to MaxSafeElements for correct loop emission.
3191 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3192 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3193 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3194 "safe_avl");
3195 }
3196 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3198
3199 auto *CanonicalIVIncrement =
3200 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3201 Builder.setInsertPoint(CanonicalIVIncrement);
3202 VPValue *OpVPEVL = VPEVL;
3203
3204 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3205 OpVPEVL = Builder.createScalarZExtOrTrunc(
3206 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3207
3208 auto *NextEVLIV = Builder.createOverflowingOp(
3209 Instruction::Add, {OpVPEVL, EVLPhi},
3210 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3211 CanonicalIVIncrement->hasNoSignedWrap()},
3212 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3213 EVLPhi->addOperand(NextEVLIV);
3214
3215 VPValue *NextAVL = Builder.createOverflowingOp(
3216 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3217 DebugLoc::getCompilerGenerated(), "avl.next");
3218 AVLPhi->addOperand(NextAVL);
3219
3220 transformRecipestoEVLRecipes(Plan, *VPEVL);
3221
3222 // Replace all uses of VPCanonicalIVPHIRecipe by
3223 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3224 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3225 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3226 // TODO: support unroll factor > 1.
3227 Plan.setUF(1);
3228}
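// Illustrative only: a minimal scalar model of the EVL-driven loop shape the
// transform above emits. The helper name, the VLMax parameter and the clamping
// to it are assumptions for the sketch (a real target clamps the requested
// length to its hardware maximum); this is not part of the VPlan API.
static uint64_t evlLoopModelSketch(uint64_t TripCount, uint64_t VLMax,
                                   std::optional<unsigned> MaxSafeElements) {
  uint64_t EVLIV = 0; // models %EVLPhi
  for (uint64_t AVL = TripCount; AVL != 0;) {
    uint64_t SafeAVL = AVL; // models %SAFE_AVL when MaxSafeElements is set
    if (MaxSafeElements && *MaxSafeElements < SafeAVL)
      SafeAVL = *MaxSafeElements;
    uint64_t EVL = SafeAVL < VLMax ? SafeAVL : VLMax; // models %VPEVL
    // ... the vector body would process lanes [EVLIV, EVLIV + EVL) ...
    EVLIV += EVL; // models %NextEVLIV
    AVL -= EVL;   // models %NextAVL
  }
  return EVLIV; // equals TripCount on exit
}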
3229
3231 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3232 // There should be only one EVL PHI in the entire plan.
3233 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3234
3237 for (VPRecipeBase &R : VPBB->phis())
3238 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3239 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3240 EVLPhi = PhiR;
3241 }
3242
3243 // Early return if no EVL PHI is found.
3244 if (!EVLPhi)
3245 return;
3246
3247 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3248 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3249 VPValue *AVL;
3250 [[maybe_unused]] bool FoundAVL =
3251 match(EVLIncrement,
3252 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3253 assert(FoundAVL && "Didn't find AVL?");
3254
3255 // The AVL may be capped to a safe distance.
3256 VPValue *SafeAVL;
3257 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3258 AVL = SafeAVL;
3259
3260 VPValue *AVLNext;
3261 [[maybe_unused]] bool FoundAVLNext =
3263 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3264 assert(FoundAVLNext && "Didn't find AVL backedge?");
3265
3266 // Convert EVLPhi to concrete recipe.
3267 auto *ScalarR =
3268 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3269 EVLPhi->getDebugLoc(), "evl.based.iv");
3270 EVLPhi->replaceAllUsesWith(ScalarR);
3271 EVLPhi->eraseFromParent();
3272
3273 // Replace CanonicalIVInc with EVL-PHI increment.
3274 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3275 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3276 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3277 m_Specific(&Plan.getVFxUF()))) &&
3278 "Unexpected canonical iv");
3279 Backedge->replaceAllUsesWith(EVLIncrement);
3280
3281 // Remove unused phi and increment.
3282 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3283 CanonicalIVIncrement->eraseFromParent();
3284 CanonicalIV->eraseFromParent();
3285
3286 // Replace the use of VectorTripCount in the latch-exiting block.
3287 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3288 // After: (branch-on-cond icmp eq AVLNext, 0)
3289 VPBasicBlock *LatchExiting =
3290 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3291 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3292 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3293 return;
3294
3295 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3296 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3297 m_Specific(&Plan.getVectorTripCount())))) &&
3298 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3299 "trip count");
3300
3301 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3302 VPBuilder Builder(LatchExitingBr);
3303 LatchExitingBr->setOperand(0,
3304 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3305 Plan.getConstantInt(AVLTy, 0)));
3306}
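// Illustrative only: the AVL phi counts down from the trip count by EVL each
// iteration, so it reaches zero exactly when every element has been processed.
// That is why the latch can test "AVLNext == 0" instead of comparing an
// induction value against a trip count. Hypothetical sketch of the invariant
// (names are made up for the sketch):
static bool avlCountdownInvariantSketch(uint64_t TripCount, uint64_t EVLIVNext,
                                        uint64_t AVLNext) {
  // Maintained by %NextEVLIV = %EVLPhi + %OpEVL and %NextAVL = %AVL - %OpEVL
  // when the EVL-based IV starts at zero.
  return EVLIVNext + AVLNext == TripCount;
}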
3307
3308void VPlanTransforms::replaceSymbolicStrides(
3309 VPlan &Plan, PredicatedScalarEvolution &PSE,
3310 const DenseMap<Value *, const SCEV *> &StridesMap) {
3311 // Replace VPValues for known constant strides guaranteed by predicate scalar
3312 // evolution.
3313 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3314 auto *R = cast<VPRecipeBase>(&U);
3315 return R->getRegion() ||
3316 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3317 };
3318 ValueToSCEVMapTy RewriteMap;
3319 for (const SCEV *Stride : StridesMap.values()) {
3320 using namespace SCEVPatternMatch;
3321 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3322 const APInt *StrideConst;
3323 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3324 // Only handle constant strides for now.
3325 continue;
3326
3327 auto *CI = Plan.getConstantInt(*StrideConst);
3328 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3329 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3330
3331 // The versioned value may not be used in the loop directly but through a
3332 // sext/zext. Add new live-ins in those cases.
3333 for (Value *U : StrideV->users()) {
3334 if (!isa<SExtInst, ZExtInst>(U))
3335 continue;
3336 VPValue *StrideVPV = Plan.getLiveIn(U);
3337 if (!StrideVPV)
3338 continue;
3339 unsigned BW = U->getType()->getScalarSizeInBits();
3340 APInt C =
3341 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3342 VPValue *CI = Plan.getConstantInt(C);
3343 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3344 }
3345 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3346 }
3347
3348 for (VPRecipeBase &R : *Plan.getEntry()) {
3349 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3350 if (!ExpSCEV)
3351 continue;
3352 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3353 auto *NewSCEV =
3354 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3355 if (NewSCEV != ScevExpr) {
3356 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3357 ExpSCEV->replaceAllUsesWith(NewExp);
3358 if (Plan.getTripCount() == ExpSCEV)
3359 Plan.resetTripCount(NewExp);
3360 }
3361 }
3362}
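// Illustrative only: if the SCEV predicates guarantee a symbolic stride is the
// constant 1, a user such as "(sext i32 %stride to i64)" inside the loop can
// also be replaced by the appropriately extended constant. A hypothetical
// sketch of the extension handling mirrored by StrideConst->sext/zext above:
static int64_t versionedStrideUserSketch(int32_t KnownStride, bool IsSExt) {
  // BW == 64 case: sign- or zero-extend the known constant stride.
  return IsSExt ? static_cast<int64_t>(KnownStride)
                : static_cast<int64_t>(static_cast<uint32_t>(KnownStride));
}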
3363
3364void VPlanTransforms::dropPoisonGeneratingRecipes(
3365 VPlan &Plan,
3366 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3367 // Collect recipes in the backward slice of `Root` that may generate a poison
3368 // value that is used after vectorization.
3370 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3372 Worklist.push_back(Root);
3373
3374 // Traverse the backward slice of Root through its use-def chain.
3375 while (!Worklist.empty()) {
3376 VPRecipeBase *CurRec = Worklist.pop_back_val();
3377
3378 if (!Visited.insert(CurRec).second)
3379 continue;
3380
3381 // Prune search if we find another recipe generating a widen memory
3382 // instruction. Widen memory instructions involved in address computation
3383 // will lead to gather/scatter instructions, which don't need to be
3384 // handled.
3386 VPHeaderPHIRecipe>(CurRec))
3387 continue;
3388
3389 // This recipe contributes to the address computation of a widen
3390 // load/store. If the underlying instruction has poison-generating flags,
3391 // drop them directly.
3392 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3393 VPValue *A, *B;
3394 // Dropping disjoint from an OR may yield incorrect results, as some
3395 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3396 // for dependence analysis). Instead, replace it with an equivalent Add.
3397 // This is possible as all users of the disjoint OR only access lanes
3398 // where the operands are disjoint or poison otherwise.
3399 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3400 RecWithFlags->isDisjoint()) {
3401 VPBuilder Builder(RecWithFlags);
3402 VPInstruction *New = Builder.createOverflowingOp(
3403 Instruction::Add, {A, B}, {false, false},
3404 RecWithFlags->getDebugLoc());
3405 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3406 RecWithFlags->replaceAllUsesWith(New);
3407 RecWithFlags->eraseFromParent();
3408 CurRec = New;
3409 } else
3410 RecWithFlags->dropPoisonGeneratingFlags();
3411 } else {
3414 (void)Instr;
3415 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3416 "found instruction with poison generating flags not covered by "
3417 "VPRecipeWithIRFlags");
3418 }
3419
3420 // Add new definitions to the worklist.
3421 for (VPValue *Operand : CurRec->operands())
3422 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3423 Worklist.push_back(OpDef);
3424 }
3425 });
3426
3427 // Traverse all the recipes in the VPlan and collect the poison-generating
3428 // recipes in the backward slice starting at the address of a
3429 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3430 auto Iter = vp_depth_first_deep(Plan.getEntry());
3431 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3432 for (VPRecipeBase &Recipe : *VPBB) {
3433 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3434 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3435 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3436 if (AddrDef && WidenRec->isConsecutive() &&
3437 BlockNeedsPredication(UnderlyingInstr.getParent()))
3438 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3439 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3440 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3441 if (AddrDef) {
3442 // Check if any member of the interleave group needs predication.
3443 const InterleaveGroup<Instruction> *InterGroup =
3444 InterleaveRec->getInterleaveGroup();
3445 bool NeedPredication = false;
3446 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3447 I < NumMembers; ++I) {
3448 Instruction *Member = InterGroup->getMember(I);
3449 if (Member)
3450 NeedPredication |= BlockNeedsPredication(Member->getParent());
3451 }
3452
3453 if (NeedPredication)
3454 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3455 }
3456 }
3457 }
3458 }
3459}
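// Illustrative only: why a disjoint OR is rewritten as an ADD instead of
// merely dropping the flag. When the operands share no set bits, the two
// operations produce identical results, so the replacement is safe for the
// lanes the users may access. Hypothetical sketch:
static bool disjointOrEqualsAddSketch(uint64_t A, uint64_t B) {
  if ((A & B) != 0)
    return true; // not disjoint; the rewrite above is not applied.
  return (A | B) == (A + B); // holds whenever A and B are disjoint.
}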
3460
3461void VPlanTransforms::createInterleaveGroups(
3462 VPlan &Plan,
3463 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3464 &InterleaveGroups,
3465 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3466 if (InterleaveGroups.empty())
3467 return;
3468
3469 // Interleave memory: for each Interleave Group we marked earlier as relevant
3470 // for this VPlan, replace the Recipes widening its memory instructions with a
3471 // single VPInterleaveRecipe at its insertion point.
3472 VPDominatorTree VPDT(Plan);
3473 for (const auto *IG : InterleaveGroups) {
3474 auto *Start =
3475 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3476 VPIRMetadata InterleaveMD(*Start);
3477 SmallVector<VPValue *, 4> StoredValues;
3478 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3479 StoredValues.push_back(StoreR->getStoredValue());
3480 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3481 Instruction *MemberI = IG->getMember(I);
3482 if (!MemberI)
3483 continue;
3484 VPWidenMemoryRecipe *MemoryR =
3485 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3486 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3487 StoredValues.push_back(StoreR->getStoredValue());
3488 InterleaveMD.intersect(*MemoryR);
3489 }
3490
3491 bool NeedsMaskForGaps =
3492 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3493 (!StoredValues.empty() && !IG->isFull());
3494
3495 Instruction *IRInsertPos = IG->getInsertPos();
3496 auto *InsertPos =
3497 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3498
3500 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3501 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3502 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3503
3504 // Get or create the start address for the interleave group.
3505 VPValue *Addr = Start->getAddr();
3506 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3507 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3508 // We cannot re-use the address of member zero because it does not
3509 // dominate the insert position. Instead, use the address of the insert
3510 // position and create a PtrAdd adjusting it to the address of member
3511 // zero.
3512 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3513 // InsertPos or sink loads above zero members to join it.
3514 assert(IG->getIndex(IRInsertPos) != 0 &&
3515 "index of insert position shouldn't be zero");
3516 auto &DL = IRInsertPos->getDataLayout();
3517 APInt Offset(32,
3518 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3519 IG->getIndex(IRInsertPos),
3520 /*IsSigned=*/true);
3521 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3522 VPBuilder B(InsertPos);
3523 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3524 }
3525 // If the group is reverse, adjust the index to refer to the last vector
3526 // lane instead of the first. We adjust the index from the first vector
3527 // lane, rather than directly getting the pointer for lane VF - 1, because
3528 // the pointer operand of the interleaved access is supposed to be uniform.
3529 if (IG->isReverse()) {
3530 auto *ReversePtr = new VPVectorEndPointerRecipe(
3531 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3532 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3533 ReversePtr->insertBefore(InsertPos);
3534 Addr = ReversePtr;
3535 }
3536 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3537 InsertPos->getMask(), NeedsMaskForGaps,
3538 InterleaveMD, InsertPos->getDebugLoc());
3539 VPIG->insertBefore(InsertPos);
3540
3541 unsigned J = 0;
3542 for (unsigned i = 0; i < IG->getFactor(); ++i)
3543 if (Instruction *Member = IG->getMember(i)) {
3544 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3545 if (!Member->getType()->isVoidTy()) {
3546 VPValue *OriginalV = MemberR->getVPSingleValue();
3547 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3548 J++;
3549 }
3550 MemberR->eraseFromParent();
3551 }
3552 }
3553}
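// Illustrative only: when the insert position is member I (I != 0) of the
// group, the address of member zero is recovered by subtracting I times the
// member allocation size from the insert position's address, matching the
// negative-offset PtrAdd created above. Hypothetical sketch:
static uint64_t memberZeroAddrSketch(uint64_t InsertPosAddr,
                                     uint64_t MemberSizeInBytes,
                                     unsigned InsertPosIndex) {
  return InsertPosAddr - MemberSizeInBytes * InsertPosIndex;
}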
3554
3555/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3556/// value, phi and backedge value. In the following example:
3557///
3558/// vector.ph:
3559/// Successor(s): vector loop
3560///
3561/// <x1> vector loop: {
3562/// vector.body:
3563/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3564/// ...
3565/// EMIT branch-on-count ...
3566/// No successors
3567/// }
3568///
3569/// WIDEN-INDUCTION will get expanded to:
3570///
3571/// vector.ph:
3572/// ...
3573/// vp<%induction.start> = ...
3574/// vp<%induction.increment> = ...
3575///
3576/// Successor(s): vector loop
3577///
3578/// <x1> vector loop: {
3579/// vector.body:
3580/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3581/// ...
3582/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3583/// EMIT branch-on-count ...
3584/// No successors
3585/// }
3586static void
3587expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3588 VPTypeAnalysis &TypeInfo) {
3589 VPlan *Plan = WidenIVR->getParent()->getPlan();
3590 VPValue *Start = WidenIVR->getStartValue();
3591 VPValue *Step = WidenIVR->getStepValue();
3592 VPValue *VF = WidenIVR->getVFValue();
3593 DebugLoc DL = WidenIVR->getDebugLoc();
3594
3595 // The value from the original loop to which we are mapping the new induction
3596 // variable.
3597 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3598
3599 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3600 Instruction::BinaryOps AddOp;
3601 Instruction::BinaryOps MulOp;
3602 VPIRFlags Flags = *WidenIVR;
3603 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3604 AddOp = Instruction::Add;
3605 MulOp = Instruction::Mul;
3606 } else {
3607 AddOp = ID.getInductionOpcode();
3608 MulOp = Instruction::FMul;
3609 }
3610
3611 // If the phi is truncated, truncate the start and step values.
3612 VPBuilder Builder(Plan->getVectorPreheader());
3613 Type *StepTy = TypeInfo.inferScalarType(Step);
3614 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3615 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3616 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3617 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3618 // Truncation doesn't preserve WrapFlags.
3619 Flags.dropPoisonGeneratingFlags();
3620 StepTy = Ty;
3621 }
3622
3623 // Construct the initial value of the vector IV in the vector loop preheader.
3624 Type *IVIntTy =
3625 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3626 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3627 if (StepTy->isFloatingPointTy())
3628 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3629
3630 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3631 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3632
3633 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3634 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3635 DebugLoc::getUnknown(), "induction");
3636
3637 // Create the widened phi of the vector IV.
3638 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3639 WidenIVR->getDebugLoc(), "vec.ind");
3640 WidePHI->insertBefore(WidenIVR);
3641
3642 // Create the backedge value for the vector IV.
3643 VPValue *Inc;
3644 VPValue *Prev;
3645 // If unrolled, use the increment and prev value from the operands.
3646 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3647 Inc = SplatVF;
3648 Prev = WidenIVR->getLastUnrolledPartOperand();
3649 } else {
3650 if (VPRecipeBase *R = VF->getDefiningRecipe())
3651 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3652 // Multiply the vectorization factor by the step using integer or
3653 // floating-point arithmetic as appropriate.
3654 if (StepTy->isFloatingPointTy())
3655 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3656 DL);
3657 else
3658 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3659 TypeInfo.inferScalarType(VF), DL);
3660
3661 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3662 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3663 Prev = WidePHI;
3664 }
3665
3666 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3667 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3668 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3669 WidenIVR->getDebugLoc(), "vec.ind.next");
3670
3671 WidePHI->addOperand(Next);
3672
3673 WidenIVR->replaceAllUsesWith(WidePHI);
3674}
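// Illustrative only: a scalar model of the expansion above for an integer IV,
// showing the initial vector <Start, Start+Step, ...> and the per-iteration
// increment by VF * Step. Hypothetical helper; the caller provides storage for
// VF lane values.
static void widenIntIVModelSketch(int64_t Start, int64_t Step, unsigned VF,
                                  int64_t *LaneValues /* VF elements */) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    LaneValues[Lane] = Start + static_cast<int64_t>(Lane) * Step; // %induction
  // Each loop iteration then adds the broadcast increment to every lane:
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    LaneValues[Lane] += static_cast<int64_t>(VF) * Step; // %vec.ind.next
}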
3675
3676/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3677/// initial value, phi and backedge value. In the following example:
3678///
3679/// <x1> vector loop: {
3680/// vector.body:
3681/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3682/// ...
3683/// EMIT branch-on-count ...
3684/// }
3685///
3686/// WIDEN-POINTER-INDUCTION will get expanded to:
3687///
3688/// <x1> vector loop: {
3689/// vector.body:
3690/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3691/// EMIT %mul = mul %stepvector, %step
3692/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3693/// ...
3694/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3695/// EMIT branch-on-count ...
3696/// }
3697static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3698 VPTypeAnalysis &TypeInfo) {
3699 VPlan *Plan = R->getParent()->getPlan();
3700 VPValue *Start = R->getStartValue();
3701 VPValue *Step = R->getStepValue();
3702 VPValue *VF = R->getVFValue();
3703
3704 assert(R->getInductionDescriptor().getKind() ==
3705 InductionDescriptor::IK_PtrInduction &&
3706 "Not a pointer induction according to InductionDescriptor!");
3707 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3708 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3709 "Recipe should have been replaced");
3710
3711 VPBuilder Builder(R);
3712 DebugLoc DL = R->getDebugLoc();
3713
3714 // Build a scalar pointer phi.
3715 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3716
3717 // Create actual address geps that use the pointer phi as base and a
3718 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3719 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3720 Type *StepTy = TypeInfo.inferScalarType(Step);
3721 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3722 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3723 VPValue *PtrAdd = Builder.createNaryOp(
3724 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3725 R->replaceAllUsesWith(PtrAdd);
3726
3727 // Create the backedge value for the scalar pointer phi.
3728 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3729 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3730 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3731 DL);
3732 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3733
3734 VPValue *InductionGEP =
3735 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3736 ScalarPtrPhi->addOperand(InductionGEP);
3737}
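// Illustrative only: a scalar model of the pointer-IV expansion above. Each
// lane's address is the scalar phi plus Lane * Step bytes (%vector.gep), and
// the scalar phi itself advances by VF * Step bytes per iteration (%ptr.ind).
// Hypothetical sketch; the caller provides storage for VF lane addresses.
static void widenPtrIVModelSketch(char *PointerPhi, int64_t StepInBytes,
                                  unsigned VF, char **LaneAddrs) {
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    LaneAddrs[Lane] = PointerPhi + static_cast<int64_t>(Lane) * StepInBytes;
  // Backedge value of the scalar phi:
  //   PointerPhi += VF * StepInBytes;  // %ptr.ind
}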
3738
3739void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3740 // Replace loop regions with explicit CFG.
3741 SmallVector<VPRegionBlock *> LoopRegions;
3742 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3743 vp_depth_first_deep(Plan.getEntry()))) {
3744 if (!R->isReplicator())
3745 LoopRegions.push_back(R);
3746 }
3747 for (VPRegionBlock *R : LoopRegions)
3748 R->dissolveToCFGLoop();
3749}
3750
3753 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3754 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3757 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3758 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3759 }
3760
3761 // Expand BranchOnTwoConds instructions into explicit CFG with
3762 // single-condition branches, by introducing a new branch in VPBB that jumps
3763 // to a new intermediate block if either condition is true and to the
3764 // third successor otherwise. The intermediate block jumps to the first or
3765 // second successor, depending on the first condition.
3766 for (VPInstruction *Br : WorkList) {
3767 assert(Br->getNumOperands() == 2 &&
3768 "BranchOnTwoConds must have exactly 2 conditions");
3769 DebugLoc DL = Br->getDebugLoc();
3770 VPBasicBlock *Latch = Br->getParent();
3771 const auto Successors = to_vector(Latch->getSuccessors());
3772 assert(Successors.size() == 3 &&
3773 "BranchOnTwoConds must have exactly 3 successors");
3774
3775 for (VPBlockBase *Succ : Successors)
3776 VPBlockUtils::disconnectBlocks(Latch, Succ);
3777
3778 VPValue *EarlyExitingCond = Br->getOperand(0);
3779 VPValue *LateExitingCond = Br->getOperand(1);
3780 VPBlockBase *EarlyExitBB = Successors[0];
3781 VPBlockBase *LateExitBB = Successors[1];
3782 VPBlockBase *Header = Successors[2];
3783
3784 VPBasicBlock *MiddleSplit = Plan.createVPBasicBlock("middle.split");
3785 MiddleSplit->setParent(LateExitBB->getParent());
3786
3787 VPBuilder Builder(Latch);
3788 VPValue *AnyExitTaken = Builder.createNaryOp(
3789 Instruction::Or, {EarlyExitingCond, LateExitingCond}, DL);
3790 Builder.createNaryOp(VPInstruction::BranchOnCond, {AnyExitTaken}, DL);
3791 VPBlockUtils::connectBlocks(Latch, MiddleSplit);
3792 VPBlockUtils::connectBlocks(Latch, Header);
3793
3794 VPBuilder(MiddleSplit)
3795 .createNaryOp(VPInstruction::BranchOnCond, {EarlyExitingCond}, DL);
3796 VPBlockUtils::connectBlocks(MiddleSplit, EarlyExitBB);
3797 VPBlockUtils::connectBlocks(MiddleSplit, LateExitBB);
3798
3799 Br->eraseFromParent();
3800 }
3801}
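// Illustrative only: the expanded control flow above corresponds to the
// following nested single-condition branches, with the intermediate block
// disambiguating between the two exit destinations. Hypothetical sketch
// returning the successor index.
static unsigned branchOnTwoCondsModelSketch(bool EarlyExitingCond,
                                            bool LateExitingCond) {
  if (EarlyExitingCond || LateExitingCond) { // latch: branch to middle.split
    if (EarlyExitingCond)                    // middle.split: pick the exit
      return 0;                              // early-exit successor
    return 1;                                // late-exit successor
  }
  return 2; // continue looping via the header successor
}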
3802
3803void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3804 VPTypeAnalysis TypeInfo(Plan);
3805 SmallVector<VPRecipeBase *> ToRemove;
3806 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3807 vp_depth_first_deep(Plan.getEntry()))) {
3808 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3809 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3810 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3811 ToRemove.push_back(WidenIVR);
3812 continue;
3813 }
3814
3815 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3816 // If the recipe only generates scalars, scalarize it instead of
3817 // expanding it.
3818 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3819 VPBuilder Builder(WidenIVR);
3820 VPValue *PtrAdd =
3821 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3822 WidenIVR->replaceAllUsesWith(PtrAdd);
3823 ToRemove.push_back(WidenIVR);
3824 continue;
3825 }
3826 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3827 ToRemove.push_back(WidenIVR);
3828 continue;
3829 }
3830
3831 // Expand VPBlendRecipe into VPInstruction::Select.
3832 VPBuilder Builder(&R);
3833 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3834 VPValue *Select = Blend->getIncomingValue(0);
3835 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3836 Select = Builder.createSelect(Blend->getMask(I),
3837 Blend->getIncomingValue(I), Select,
3838 R.getDebugLoc(), "predphi");
3839 Blend->replaceAllUsesWith(Select);
3840 ToRemove.push_back(Blend);
3841 }
3842
3843 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3844 Expr->decompose();
3845 ToRemove.push_back(Expr);
3846 }
3847
3848 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3849 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3850 if (LastActiveL &&
3851 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3852 // Create Not(Mask) for all operands.
3853 SmallVector<VPValue *> NotMasks;
3854 for (VPValue *Op : LastActiveL->operands()) {
3855 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3856 NotMasks.push_back(NotMask);
3857 }
3858
3859 // Create FirstActiveLane on the inverted masks.
3860 VPValue *FirstInactiveLane = Builder.createNaryOp(
3861 VPInstruction::FirstActiveLane, NotMasks,
3862 LastActiveL->getDebugLoc(), "first.inactive.lane");
3863
3864 // Subtract 1 to get the last active lane.
3865 VPValue *One = Plan.getOrAddLiveIn(
3866 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3867 VPValue *LastLane = Builder.createNaryOp(
3868 Instruction::Sub, {FirstInactiveLane, One},
3869 LastActiveL->getDebugLoc(), "last.active.lane");
3870
3871 LastActiveL->replaceAllUsesWith(LastLane);
3872 ToRemove.push_back(LastActiveL);
3873 continue;
3874 }
3875
3876 // Lower BranchOnCount to ICmp + BranchOnCond.
3877 VPValue *IV, *TC;
3878 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3879 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3880 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3881 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3882 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3883 ToRemove.push_back(BranchOnCountInst);
3884 continue;
3885 }
3886
3887 VPValue *VectorStep;
3888 VPValue *ScalarStep;
3889 if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3890 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3891 continue;
3892
3893 // Expand WideIVStep.
3894 auto *VPI = cast<VPInstruction>(&R);
3895 Type *IVTy = TypeInfo.inferScalarType(VPI);
3896 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3897 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3898 ? Instruction::UIToFP
3899 : Instruction::Trunc;
3900 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3901 }
3902
3903 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3904 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3905 ScalarStep =
3906 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3907 }
3908
3909 VPIRFlags Flags;
3910 if (IVTy->isFloatingPointTy())
3911 Flags = {VPI->getFastMathFlags()};
3912
3913 unsigned MulOpc =
3914 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3915 VPInstruction *Mul = Builder.createNaryOp(
3916 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3917 VectorStep = Mul;
3918 VPI->replaceAllUsesWith(VectorStep);
3919 ToRemove.push_back(VPI);
3920 }
3921 }
3922
3923 for (VPRecipeBase *R : ToRemove)
3924 R->eraseFromParent();
3925}
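// Illustrative only: the LastActiveLane lowering above computes
// FirstActiveLane(not(mask)) - 1, which equals the last active lane for the
// masks this is applied to (a prefix of active lanes). Hypothetical check of
// that identity for one mask of VF lanes:
static bool lastActiveLaneLoweringSketch(const bool *Mask, unsigned VF,
                                         unsigned ExpectedLastActive) {
  unsigned FirstInactive = VF;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    if (!Mask[Lane]) {
      FirstInactive = Lane; // FirstActiveLane applied to the negated mask
      break;
    }
  return FirstInactive - 1 == ExpectedLastActive;
}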
3926
3927void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3928 VPBasicBlock *EarlyExitVPBB,
3929 VPlan &Plan,
3930 VPBasicBlock *HeaderVPBB,
3931 VPBasicBlock *LatchVPBB) {
3932 auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
3933 if (!EarlyExitVPBB->getSinglePredecessor() &&
3934 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3935 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3936 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3937 "unsupported early exit VPBB");
3938 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3939 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3940 // of the phis.
3941 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3942 cast<VPIRPhi>(&R)->swapOperands();
3943 }
3944
3945 VPBuilder Builder(LatchVPBB->getTerminator());
3946 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3947 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3948 "Terminator must be BranchOnCond");
3949 VPValue *CondOfEarlyExitingVPBB =
3950 EarlyExitingVPBB->getTerminator()->getOperand(0);
3951 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3952 ? CondOfEarlyExitingVPBB
3953 : Builder.createNot(CondOfEarlyExitingVPBB);
3954
3955 // Create a BranchOnTwoConds in the latch that branches to:
3956 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
3957 VPValue *IsEarlyExitTaken =
3958 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3959 VPBasicBlock *VectorEarlyExitVPBB =
3960 Plan.createVPBasicBlock("vector.early.exit");
3961 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
3962
3963 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3964
3965 // Update the exit phis in the early exit block.
3966 VPBuilder MiddleBuilder(MiddleVPBB);
3967 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3968 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3969 auto *ExitIRI = cast<VPIRPhi>(&R);
3970 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3971 // a single predecessor and 1 if it has two.
3972 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3973 if (ExitIRI->getNumOperands() != 1) {
3974 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3975 // predecessor. Extract its final lane.
3976 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3977 }
3978
3979 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3980 if (!isa<VPIRValue>(IncomingFromEarlyExit)) {
3981 // Update the incoming value from the early exit.
3982 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3983 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3984 DebugLoc::getUnknown(), "first.active.lane");
3985 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3986 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3987 DebugLoc::getUnknown(), "early.exit.value");
3988 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3989 }
3990 }
3991
3992 // Replace the conditional branch controlling the latch exit from the vector
3993 // loop with a multi-conditional branch exiting to vector early exit if the
3994 // early exit has been taken, exiting to middle block if the original
3995 // condition of the vector latch is true, otherwise continuing back to header.
3996 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3997 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3998 "Unexpected terminator");
3999 auto *IsLatchExitTaken =
4000 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4001 LatchExitingBranch->getOperand(1));
4002
4003 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4004 LatchExitingBranch->eraseFromParent();
4005
4006 Builder.setInsertPoint(LatchVPBB);
4007 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4008 {IsEarlyExitTaken, IsLatchExitTaken}, LatchDL);
4009 LatchVPBB->clearSuccessors();
4010 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
4011 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
4012}
4013
4014/// This function tries to convert extended in-loop reductions to
4015/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial and
4016/// valid. The created recipe must be decomposed to its constituent
4017/// recipes before execution.
4018static VPExpressionRecipe *
4019tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
4020 VFRange &Range) {
4021 Type *RedTy = Ctx.Types.inferScalarType(Red);
4022 VPValue *VecOp = Red->getVecOp();
4023
4024 // Clamp the range if using extended-reduction is profitable.
4025 auto IsExtendedRedValidAndClampRange =
4026 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4028 [&](ElementCount VF) {
4029 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4031
4032 InstructionCost ExtRedCost;
4033 InstructionCost ExtCost =
4034 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4035 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4036
4037 if (Red->isPartialReduction()) {
4040 // FIXME: Move partial reduction creation, costing and clamping
4041 // here from LoopVectorize.cpp.
4042 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4043 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4044 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
4045 } else {
4046 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4047 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4048 Red->getFastMathFlags(), CostKind);
4049 }
4050 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4051 },
4052 Range);
4053 };
4054
4055 VPValue *A;
4056 // Match reduce(ext)).
4057 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
4058 IsExtendedRedValidAndClampRange(
4059 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4060 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4061 Ctx.Types.inferScalarType(A)))
4062 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4063
4064 return nullptr;
4065}
4066
4067/// This function tries to convert extended in-loop reductions to
4068/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial
4069/// and valid. The created VPExpressionRecipe must be decomposed to its
4070/// constituent recipes before execution. Patterns of the
4071/// VPExpressionRecipe:
4072/// reduce.add(mul(...)),
4073/// reduce.add(mul(ext(A), ext(B))),
4074/// reduce.add(ext(mul(ext(A), ext(B)))).
4075static VPExpressionRecipe *
4076tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
4077 VPCostContext &Ctx, VFRange &Range) {
4078 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4079 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
4080 return nullptr;
4081
4082 Type *RedTy = Ctx.Types.inferScalarType(Red);
4083
4084 // Clamp the range if using multiply-accumulate-reduction is profitable.
4085 auto IsMulAccValidAndClampRange =
4086 [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
4087 VPWidenCastRecipe *OuterExt) -> bool {
4089 [&](ElementCount VF) {
4091 Type *SrcTy =
4092 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4093 InstructionCost MulAccCost;
4094
4095 if (Red->isPartialReduction()) {
4096 Type *SrcTy2 =
4097 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4098 // FIXME: Move partial reduction creation, costing and clamping
4099 // here from LoopVectorize.cpp.
4100 MulAccCost = Ctx.TTI.getPartialReductionCost(
4101 Opcode, SrcTy, SrcTy2, RedTy, VF,
4103 Ext0->getOpcode())
4106 Ext1->getOpcode())
4108 Mul->getOpcode(), CostKind);
4109 } else {
4110 // Only partial reductions support mixed extends at the moment.
4111 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
4112 return false;
4113
4114 bool IsZExt =
4115 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4116 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4117 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4118 SrcVecTy, CostKind);
4119 }
4120
4121 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4122 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4123 InstructionCost ExtCost = 0;
4124 if (Ext0)
4125 ExtCost += Ext0->computeCost(VF, Ctx);
4126 if (Ext1)
4127 ExtCost += Ext1->computeCost(VF, Ctx);
4128 if (OuterExt)
4129 ExtCost += OuterExt->computeCost(VF, Ctx);
4130
4131 return MulAccCost.isValid() &&
4132 MulAccCost < ExtCost + MulCost + RedCost;
4133 },
4134 Range);
4135 };
4136
4137 VPValue *VecOp = Red->getVecOp();
4138 VPRecipeBase *Sub = nullptr;
4139 VPValue *A, *B;
4140 VPValue *Tmp = nullptr;
4141 // Sub reductions could have a sub between the add reduction and vec op.
4142 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4143 Sub = VecOp->getDefiningRecipe();
4144 VecOp = Tmp;
4145 }
4146
4147 // If ValB is a constant and can be safely extended, truncate it to the same
4148 // type as ExtA's operand, then extend it to the same type as ExtA. This
4149 // creates two uniform extends that can more easily be matched by the rest of
4150 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4151 // replaced with the new extend of the constant.
4152 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4153 VPWidenCastRecipe *&ExtB,
4154 VPValue *&ValB, VPWidenRecipe *Mul) {
4155 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4156 return;
4157 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4158 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4159 const APInt *Const;
4160 if (!match(ValB, m_APInt(Const)) ||
4162 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4163 return;
4164 // The truncate ensures that the type of each extended operand is the
4165 // same, and it's been proven that the constant can be extended from
4166 // NarrowTy safely. Necessary since ExtA's extended operand would be
4167 // e.g. an i8, while the const will likely be an i32. This will be
4168 // elided by later optimisations.
4169 VPBuilder Builder(Mul);
4170 auto *Trunc =
4171 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4172 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4173 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4174 Mul->setOperand(1, ExtB);
4175 };
4176
4177 // Try to match reduce.add(mul(...)).
4178 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4181 auto *Mul = cast<VPWidenRecipe>(VecOp);
4182
4183 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4184 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4185
4186 // Match reduce.add/sub(mul(ext, ext)).
4187 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4188 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4189 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4190 if (Sub)
4191 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4192 cast<VPWidenRecipe>(Sub), Red);
4193 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4194 }
4195 // TODO: Add an expression type for this variant with a negated mul
4196 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4197 return new VPExpressionRecipe(Mul, Red);
4198 }
4199 // TODO: Add an expression type for negated versions of other expression
4200 // variants.
4201 if (Sub)
4202 return nullptr;
4203
4204 // Match reduce.add(ext(mul(A, B))).
4205 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4206 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4207 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4210
4211 // reduce.add(ext(mul(ext, const)))
4212 // -> reduce.add(ext(mul(ext, ext(const))))
4213 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4214
4215 // reduce.add(ext(mul(ext(A), ext(B))))
4216 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4217 // The inner extends must either have the same opcode as the outer extend or
4218 // be the same, in which case the multiply can never result in a negative
4219 // value and the outer extend can be folded away by doing wider
4220 // extends for the operands of the mul.
4221 if (Ext0 && Ext1 &&
4222 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4223 Ext0->getOpcode() == Ext1->getOpcode() &&
4224 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4225 auto *NewExt0 = new VPWidenCastRecipe(
4226 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4227 *Ext0, *Ext0, Ext0->getDebugLoc());
4228 NewExt0->insertBefore(Ext0);
4229
4230 VPWidenCastRecipe *NewExt1 = NewExt0;
4231 if (Ext0 != Ext1) {
4232 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4233 Ext->getResultType(), nullptr, *Ext1,
4234 *Ext1, Ext1->getDebugLoc());
4235 NewExt1->insertBefore(Ext1);
4236 }
4237 Mul->setOperand(0, NewExt0);
4238 Mul->setOperand(1, NewExt1);
4239 Red->setOperand(1, Mul);
4240 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4241 }
4242 }
4243 return nullptr;
4244}
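// Illustrative only: reduce.add(ext(mul(ext(A), ext(B)))) can fold the outer
// extend into wider inner extends, because extending the narrow operands
// directly to the result width yields the same product when the inner and
// outer extends agree in kind. Hypothetical sketch for the zero-extend case
// with i8 inputs, an i32 multiply and an i64 result:
static bool widerExtendFoldSketch(uint8_t A, uint8_t B) {
  uint64_t Folded = static_cast<uint64_t>(A) * static_cast<uint64_t>(B);
  uint64_t Nested = static_cast<uint64_t>(
      static_cast<uint32_t>(A) * static_cast<uint32_t>(B));
  return Folded == Nested; // holds since the i32 product cannot wrap for i8.
}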
4245
4246/// This function tries to create abstract recipes from the reduction recipe
4247/// for subsequent optimizations and cost estimation.
4249 VPCostContext &Ctx,
4250 VFRange &Range) {
4251 VPExpressionRecipe *AbstractR = nullptr;
4252 auto IP = std::next(Red->getIterator());
4253 auto *VPBB = Red->getParent();
4254 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4255 AbstractR = MulAcc;
4256 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4257 AbstractR = ExtRed;
4258 // Cannot create abstract inloop reduction recipes.
4259 if (!AbstractR)
4260 return;
4261
4262 AbstractR->insertBefore(*VPBB, IP);
4263 Red->replaceAllUsesWith(AbstractR);
4264}
4265
4276
4277void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
4278 if (Plan.hasScalarVFOnly())
4279 return;
4280
4281#ifndef NDEBUG
4282 VPDominatorTree VPDT(Plan);
4283#endif
4284
4285 SmallVector<VPValue *> VPValues;
4288 append_range(VPValues, Plan.getLiveIns());
4289 for (VPRecipeBase &R : *Plan.getEntry())
4290 append_range(VPValues, R.definedValues());
4291
4292 auto *VectorPreheader = Plan.getVectorPreheader();
4293 for (VPValue *VPV : VPValues) {
4295 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4296 continue;
4297
4298 // Add explicit broadcast at the insert point that dominates all users.
4299 VPBasicBlock *HoistBlock = VectorPreheader;
4300 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4301 for (VPUser *User : VPV->users()) {
4302 if (User->usesScalars(VPV))
4303 continue;
4304 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4305 HoistPoint = HoistBlock->begin();
4306 else
4307 assert(VPDT.dominates(VectorPreheader,
4308 cast<VPRecipeBase>(User)->getParent()) &&
4309 "All users must be in the vector preheader or dominated by it");
4310 }
4311
4312 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4313 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4314 VPV->replaceUsesWithIf(Broadcast,
4315 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4316 return Broadcast != &U && !U.usesScalars(VPV);
4317 });
4318 }
4319}
4320
4322 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4323
4324 // Collect candidate loads with invariant addresses and noalias scopes
4325 // metadata and memory-writing recipes with noalias metadata.
4329 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4330 for (VPRecipeBase &R : *VPBB) {
4331 // Only handle single-scalar replicated loads with invariant addresses.
4332 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4333 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4334 RepR->getOpcode() != Instruction::Load)
4335 continue;
4336
4337 VPValue *Addr = RepR->getOperand(0);
4338 if (Addr->isDefinedOutsideLoopRegions()) {
4340 if (!Loc.AATags.Scope)
4341 continue;
4342 CandidateLoads.push_back({RepR, Loc});
4343 }
4344 }
4345 if (R.mayWriteToMemory()) {
4346 auto Loc = vputils::getMemoryLocation(R);
4347 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4348 return;
4349 Stores.push_back(*Loc);
4350 }
4351 }
4352 }
4353
4354 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4355 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4356 // Hoist the load to the preheader if it doesn't alias with any stores
4357 // according to the noalias metadata. Other loads should have been hoisted
4358 // by earlier passes.
4359 const AAMDNodes &LoadAA = LoadLoc.AATags;
4360 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4362 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4363 })) {
4364 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4365 }
4366 }
4367}
4368
4369// Collect common metadata from a group of replicate recipes by intersecting
4370// metadata from all recipes in the group.
4372 VPIRMetadata CommonMetadata = *Recipes.front();
4373 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4374 CommonMetadata.intersect(*Recipe);
4375 return CommonMetadata;
4376}
4377
4378template <unsigned Opcode>
4382 const Loop *L) {
4383 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4384 "Only Load and Store opcodes supported");
4385 constexpr bool IsLoad = (Opcode == Instruction::Load);
4386 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4387 VPTypeAnalysis TypeInfo(Plan);
4388
4389 // Group predicated operations by their address SCEV.
4391 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4392 auto *VPBB = cast<VPBasicBlock>(Block);
4393 for (VPRecipeBase &R : *VPBB) {
4394 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4395 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4396 continue;
4397
4398 // For loads, operand 0 is address; for stores, operand 1 is address.
4399 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4400 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4401 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4402 RecipesByAddress[AddrSCEV].push_back(RepR);
4403 }
4404 }
4405
4406 // For each address, collect operations with the same or complementary masks.
4408 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4409 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4410 };
4411 for (auto &[Addr, Recipes] : RecipesByAddress) {
4412 if (Recipes.size() < 2)
4413 continue;
4414
4415 // Collect groups with the same or complementary masks.
4416 for (VPReplicateRecipe *&RecipeI : Recipes) {
4417 if (!RecipeI)
4418 continue;
4419
4420 VPValue *MaskI = RecipeI->getMask();
4421 Type *TypeI = GetLoadStoreValueType(RecipeI);
4423 Group.push_back(RecipeI);
4424 RecipeI = nullptr;
4425
4426 // Find all operations with the same or complementary masks.
4427 bool HasComplementaryMask = false;
4428 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4429 if (!RecipeJ)
4430 continue;
4431
4432 VPValue *MaskJ = RecipeJ->getMask();
4433 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4434 if (TypeI == TypeJ) {
4435 // Check if any operation in the group has a complementary mask with
4436 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4437 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4438 match(MaskJ, m_Not(m_Specific(MaskI)));
4439 Group.push_back(RecipeJ);
4440 RecipeJ = nullptr;
4441 }
4442 }
4443
4444 if (HasComplementaryMask) {
4445 assert(Group.size() >= 2 && "must have at least 2 entries");
4446 AllGroups.push_back(std::move(Group));
4447 }
4448 }
4449 }
4450
4451 return AllGroups;
4452}
4453
4454// Find the recipe with minimum alignment in the group.
4455template <typename InstType>
4456static VPReplicateRecipe *
4458 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4459 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4460 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4461 });
4462}
4463
4466 const Loop *L) {
4467 auto Groups =
4469 if (Groups.empty())
4470 return;
4471
4472 VPDominatorTree VPDT(Plan);
4473
4474 // Process each group of loads.
4475 for (auto &Group : Groups) {
4476 // Sort loads by dominance order, with earliest (most dominating) first.
4477 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4478 return VPDT.properlyDominates(A, B);
4479 });
4480
4481 // Try to use the earliest (most dominating) load to replace all others.
4482 VPReplicateRecipe *EarliestLoad = Group[0];
4483 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4484 VPBasicBlock *LastBB = Group.back()->getParent();
4485
4486 // Check that the load doesn't alias with stores between first and last.
4487 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4488 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4489 continue;
4490
4491 // Collect common metadata from all loads in the group.
4492 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4493
4494 // Find the load with minimum alignment to use.
4495 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4496
4497 // Create an unpredicated version of the earliest load with common
4498 // metadata.
4499 auto *UnpredicatedLoad = new VPReplicateRecipe(
4500 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4501 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4502 CommonMetadata);
4503
4504 UnpredicatedLoad->insertBefore(EarliestLoad);
4505
4506 // Replace all loads in the group with the unpredicated load.
4507 for (VPReplicateRecipe *Load : Group) {
4508 Load->replaceAllUsesWith(UnpredicatedLoad);
4509 Load->eraseFromParent();
4510 }
4511 }
4512}
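// Illustrative only: two predicated loads of the same address under
// complementary masks together read that location on every lane, so a single
// unconditional load can serve all former users. Hypothetical scalar sketch:
static int mergedComplementaryLoadSketch(const int *Addr, bool Mask) {
  // Before: one load executes under Mask, the other under !Mask; both read
  //         *Addr, so the location is always accessed.
  // After:  a single unconditional load replaces the group.
  (void)Mask;
  return *Addr;
}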
4513
4514static bool
4515canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
4516 PredicatedScalarEvolution &PSE, const Loop &L,
4517 VPTypeAnalysis &TypeInfo) {
4518 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4519 if (!StoreLoc || !StoreLoc->AATags.Scope)
4520 return false;
4521
4522 // When sinking a group of stores, all members of the group alias each other.
4523 // Skip them during the alias checks.
4524 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4525 StoresToSink.end());
4526
4527 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4528 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4529 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4530 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4531}
4532
4535 const Loop *L) {
4536 auto Groups =
4538 if (Groups.empty())
4539 return;
4540
4541 VPDominatorTree VPDT(Plan);
4542 VPTypeAnalysis TypeInfo(Plan);
4543
4544 for (auto &Group : Groups) {
4545 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4546 return VPDT.properlyDominates(A, B);
4547 });
4548
4549 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4550 continue;
4551
4552 // Use the last (most dominated) store's location for the unconditional
4553 // store.
4554 VPReplicateRecipe *LastStore = Group.back();
4555 VPBasicBlock *InsertBB = LastStore->getParent();
4556
4557 // Collect common alias metadata from all stores in the group.
4558 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4559
4560 // Build select chain for stored values.
4561 VPValue *SelectedValue = Group[0]->getOperand(0);
4562 VPBuilder Builder(InsertBB, LastStore->getIterator());
4563
4564 for (unsigned I = 1; I < Group.size(); ++I) {
4565 VPValue *Mask = Group[I]->getMask();
4566 VPValue *Value = Group[I]->getOperand(0);
4567 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4568 Group[I]->getDebugLoc());
4569 }
4570
4571 // Find the store with minimum alignment to use.
4572 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4573
4574 // Create unconditional store with selected value and common metadata.
4575 auto *UnpredicatedStore =
4576 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4577 {SelectedValue, LastStore->getOperand(1)},
4578 /*IsSingleScalar=*/false,
4579 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4580 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4581
4582 // Remove all predicated stores from the group.
4583 for (VPReplicateRecipe *Store : Group)
4584 Store->eraseFromParent();
4585 }
4586}
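// Illustrative only: with complementary masks, the predicated stores in a
// group always write the location, so they can be replaced by one
// unconditional store of a mask-selected value, as built above. Hypothetical
// scalar sketch for a group of two:
static void mergedComplementaryStoreSketch(int *Addr, bool Mask, int IfTrue,
                                           int IfFalse) {
  // Before: if (Mask) *Addr = IfTrue; if (!Mask) *Addr = IfFalse;
  // After (select chain, most-dominated store provides the insert point):
  *Addr = Mask ? IfTrue : IfFalse;
}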
4587
4589 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4590 PredicatedScalarEvolution &PSE) {
4591 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4592 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4593
4594 VPValue *TC = Plan.getTripCount();
4595 // Skip cases for which the trip count may be non-trivial to materialize.
4596 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4597 // tail is required.
4598 if (!Plan.hasScalarTail() ||
4599 Plan.getMiddleBlock()->getSingleSuccessor() ==
4600 Plan.getScalarPreheader() ||
4601 !isa<VPIRValue>(TC))
4602 return;
4603
4604 // Materialize vector trip counts for constants early if it can simply
4605 // be computed as (Original TC / VF * UF) * VF * UF.
4606 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4607 // tail-folded loops.
4608 ScalarEvolution &SE = *PSE.getSE();
4609 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4610 if (!isa<SCEVConstant>(TCScev))
4611 return;
4612 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4613 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4614 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4615 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4616}
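// Illustrative only: for a constant trip count the vector trip count is the
// largest multiple of VF * UF not exceeding it, e.g. TC = 17 with VF * UF = 8
// gives (17 / 8) * 8 = 16. Hypothetical sketch of the SCEV folding above:
static uint64_t constantVectorTripCountSketch(uint64_t TC, uint64_t VFxUF) {
  return (TC / VFxUF) * VFxUF;
}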
4617
4619 VPBasicBlock *VectorPH) {
4620 VPValue *BTC = Plan.getBackedgeTakenCount();
4621 if (BTC->getNumUsers() == 0)
4622 return;
4623
4624 VPBuilder Builder(VectorPH, VectorPH->begin());
4625 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4626 auto *TCMO = Builder.createNaryOp(
4627 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4628 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4629 BTC->replaceAllUsesWith(TCMO);
4630}
4631
4633 if (Plan.hasScalarVFOnly())
4634 return;
4635
4636 VPTypeAnalysis TypeInfo(Plan);
4637 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4638 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4639 vp_depth_first_shallow(Plan.getEntry()));
4640 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4641 vp_depth_first_shallow(LoopRegion->getEntry()));
4642 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4643 // VPInstructions, excluding ones in replicate regions. Those are not
4644 // materialized explicitly yet. Those vector users are still handled in
4645 // VPReplicateRegion::execute(), via shouldPack().
4646 // TODO: materialize build vectors for replicating recipes in replicating
4647 // regions.
4648 for (VPBasicBlock *VPBB :
4649 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4650 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4652 continue;
4653 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4654 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4655 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4656 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4657 };
4658 if ((isa<VPReplicateRecipe>(DefR) &&
4659 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4660 (isa<VPInstruction>(DefR) &&
4662 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4663 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4664 continue;
4665
4666 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4667 unsigned Opcode = ScalarTy->isStructTy()
4669 ? VPInstruction::BuildStructVector : VPInstruction::BuildVector;
4670 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4671 BuildVector->insertAfter(DefR);
4672
4673 DefR->replaceUsesWithIf(
4674 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4675 VPUser &U, unsigned) {
4676 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4677 });
4678 }
4679 }
4680
4681 // Create explicit VPInstructions to convert vectors to scalars. The current
4682 // implementation is conservative - it may miss some cases that may or may not
4683 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4684 // if they are known to operate on scalar values.
4685 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4686 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4689 continue;
4690 for (VPValue *Def : R.definedValues()) {
4691 // Skip recipes that are single-scalar or only have their first lane
4692 // used.
4693 // TODO: The Defs skipped here may or may not be vector values.
4694 // Introduce Unpacks, and remove them later, if they are guaranteed to
4695 // produce scalar values.
4697 continue;
4698
4699 // At the moment, we create unpacks only for scalar users outside
4700 // replicate regions. Recipes inside replicate regions still extract the
4701 // required lanes implicitly.
4702 // TODO: Remove once replicate regions are unrolled completely.
4703 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4704 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4705 return U->usesScalars(Def) &&
4706 (!ParentRegion || !ParentRegion->isReplicator());
4707 };
4708 if (none_of(Def->users(), IsCandidateUnpackUser))
4709 continue;
4710
4711 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4712 if (R.isPhi())
4713 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4714 else
4715 Unpack->insertAfter(&R);
4716 Def->replaceUsesWithIf(Unpack,
4717 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4718 return IsCandidateUnpackUser(&U);
4719 });
4720 }
4721 }
4722 }
4723}
4724
4726 VPBasicBlock *VectorPHVPBB,
4727 bool TailByMasking,
4728 bool RequiresScalarEpilogue) {
4729 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4730 // There's nothing to do if there are no users of the vector trip count, or if
4731 // its IR value has already been set.
4732 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4733 return;
4734
4735 VPValue *TC = Plan.getTripCount();
4736 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4737 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4738 VPValue *Step = &Plan.getVFxUF();
4739
4740 // If the tail is to be folded by masking, round the number of iterations N
4741 // up to a multiple of Step instead of rounding down. This is done by first
4742 // adding Step-1 and then rounding down. Note that it's ok if this addition
4743 // overflows: the vector induction variable will eventually wrap to zero given
4744 // that it starts at zero and its Step is a power of two; the loop will then
4745 // exit, with the last early-exit vector comparison also producing all-true.
4746 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4747 // is accounted for in emitIterationCountCheck that adds an overflow check.
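// Worked example (illustrative, not from the source): with N = 10 and
// Step = VF * UF = 4, the count is first rounded up to 10 + (4 - 1) = 13;
// the URem/Sub below then yield n.vec = 13 - (13 % 4) = 12, i.e. 3 masked
// vector iterations covering all 10 original iterations.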
4748 if (TailByMasking) {
4749 TC = Builder.createNaryOp(
4750 Instruction::Add,
4751 {TC, Builder.createNaryOp(Instruction::Sub,
4752 {Step, Plan.getConstantInt(TCTy, 1)})},
4753 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4754 }
4755
4756 // Now we need to generate the expression for the part of the loop that the
4757 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4758 // iterations are not required for correctness, or N - Step, otherwise. Step
4759 // is equal to the vectorization factor (number of SIMD elements) times the
4760 // unroll factor (number of SIMD instructions).
4761 VPValue *R =
4762 Builder.createNaryOp(Instruction::URem, {TC, Step},
4763 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4764
4765 // There are cases where we *must* run at least one iteration in the remainder
4766 // loop. See the cost model for when this can happen. If the step evenly
4767 // divides the trip count, we set the remainder to be equal to the step. If
4768 // the step does not evenly divide the trip count, no adjustment is necessary
4769 // since there will already be scalar iterations. Note that the minimum
4770 // iterations check ensures that N >= Step.
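// Worked example (illustrative): with N = 12 and Step = 4, n.mod.vf is 0, so
// the select below forces the remainder to Step, giving n.vec = 12 - 4 = 8
// and leaving 4 iterations for the scalar epilogue.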
4771 if (RequiresScalarEpilogue) {
4772 assert(!TailByMasking &&
4773 "requiring scalar epilogue is not supported with tail folding");
4774 VPValue *IsZero =
4775 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4776 R = Builder.createSelect(IsZero, Step, R);
4777 }
4778
4779 VPValue *Res = Builder.createNaryOp(
4780 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4781 VectorTC.replaceAllUsesWith(Res);
4782}
4783
4785 ElementCount VFEC) {
4786 VPBuilder Builder(VectorPH, VectorPH->begin());
4787 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4788 VPValue &VF = Plan.getVF();
4789 VPValue &VFxUF = Plan.getVFxUF();
4790 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4791 // used.
4792 // TODO: Assert that they aren't used.
4793
4794 // If there are no users of the runtime VF, compute VFxUF by constant folding
4795 // the multiplication of VF and UF.
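// For example (illustrative only): with VFEC = vscale x 4 and UF = 2 this
// emits a single element-count expression for vscale x 8, while a fixed
// VFEC = 4 simply folds to the constant 8.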
4796 if (VF.getNumUsers() == 0) {
4797 VPValue *RuntimeVFxUF =
4798 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4799 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4800 return;
4801 }
4802
4803 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4804 // vscale) * UF.
4805 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4807 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4809 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4810 }
4811 VF.replaceAllUsesWith(RuntimeVF);
4812
4813 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4814 VPValue *MulByUF = Builder.createOverflowingOp(
4815 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4816 VFxUF.replaceAllUsesWith(MulByUF);
4817}
4818
4821 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4822
4823 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4824 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4825 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4826 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4828 continue;
4829 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4830 if (!ExpSCEV)
4831 break;
4832 const SCEV *Expr = ExpSCEV->getSCEV();
4833 Value *Res =
4834 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4835 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4836 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4837 ExpSCEV->replaceAllUsesWith(Exp);
4838 if (Plan.getTripCount() == ExpSCEV)
4839 Plan.resetTripCount(Exp);
4840 ExpSCEV->eraseFromParent();
4841 }
4843 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4844 "after any VPIRInstructions");
4845 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4846 // to the VPIRBasicBlock.
4847 auto EI = Entry->begin();
4848 for (Instruction &I : drop_end(*EntryBB)) {
4849 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4850 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4851 EI++;
4852 continue;
4853 }
4855 }
4856
4857 return ExpandedSCEVs;
4858}
4859
4860/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
4861/// can be converted to a narrower recipe. \p OpV is the operand at index
4862/// \p OpIdx of a wide recipe feeding a store interleave group at index \p Idx,
4863/// and \p WideMember0 is the recipe feeding the same group at index 0. A
4864/// VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
4865/// all wide ops at all indices, i.e. if \p OpV is also the operand at index
4866/// \p OpIdx of the lane-0 recipe \p WideMember0. A VPInterleaveRecipe can be
4867/// narrowed to a wide load if \p OpV is defined at \p Idx of a full load group.
4868static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4869 VPValue *OpV, unsigned Idx) {
4870 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4871 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4872 if (!Member0OpR)
4873 return Member0Op == OpV;
4874 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4875 return !W->getMask() && Member0Op == OpV;
4876 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4877 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4878 return false;
4879}
4880
4881/// Returns true if \p InterleaveR is a full interleave group with factor and
4882/// number of members both equal to \p VF. The interleave group must also
4883/// access the full vector width \p VectorRegWidth.
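/// For example (illustrative, assuming a 128-bit vector register): with VF = 4,
/// a group of 4 interleaved i32 members has factor 4, 4 members and a group
/// size of 4 * 32 = 128 bits, and therefore qualifies.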
4884 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4885 ElementCount VF,
4886 VPTypeAnalysis &TypeInfo,
4887 TypeSize VectorRegWidth) {
4888 if (!InterleaveR || InterleaveR->getMask())
4889 return false;
4890
4891 Type *GroupElementTy = nullptr;
4892 if (InterleaveR->getStoredValues().empty()) {
4893 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4894 if (!all_of(InterleaveR->definedValues(),
4895 [&TypeInfo, GroupElementTy](VPValue *Op) {
4896 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4897 }))
4898 return false;
4899 } else {
4900 GroupElementTy =
4901 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4902 if (!all_of(InterleaveR->getStoredValues(),
4903 [&TypeInfo, GroupElementTy](VPValue *Op) {
4904 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4905 }))
4906 return false;
4907 }
4908
4909 unsigned VFMin = VF.getKnownMinValue();
4910 TypeSize GroupSize = TypeSize::get(
4911 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4912 const auto *IG = InterleaveR->getInterleaveGroup();
4913 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4914 GroupSize == VectorRegWidth;
4915}
4916
4917/// Returns true if \p VPV is a narrow VPValue.
4918static bool isAlreadyNarrow(VPValue *VPV) {
4919 if (isa<VPIRValue>(VPV))
4920 return true;
4921 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4922 return RepR && RepR->isSingleScalar();
4923}
4924
4925// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4926// a narrow variant.
4927 static VPValue *
4928 narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4929 auto *R = V->getDefiningRecipe();
4930 if (!R || NarrowedOps.contains(V))
4931 return V;
4932
4933 if (isAlreadyNarrow(V))
4934 return V;
4935
4936 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4937 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4938 WideMember0->setOperand(
4939 Idx,
4940 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4941 return V;
4942 }
4943
4944 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4945 // Narrow the interleave group to a wide load, as the transformed VPlan will
4946 // only process one original iteration.
4947 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4948 auto *L = new VPWidenLoadRecipe(
4949 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4950 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4951 L->insertBefore(LoadGroup);
4952 NarrowedOps.insert(L);
4953 return L;
4954 }
4955
4956 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4957 assert(RepR->isSingleScalar() &&
4958 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4959 "must be a single scalar load");
4960 NarrowedOps.insert(RepR);
4961 return RepR;
4962 }
4963
4964 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4965 VPValue *PtrOp = WideLoad->getAddr();
4966 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4967 PtrOp = VecPtr->getOperand(0);
4968 // Narrow the wide load to a uniform scalar load, as the transformed VPlan
4969 // will only process one original iteration.
4970 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4971 /*IsUniform*/ true,
4972 /*Mask*/ nullptr, {}, *WideLoad);
4973 N->insertBefore(WideLoad);
4974 NarrowedOps.insert(N);
4975 return N;
4976}
4977
4979 TypeSize VectorRegWidth) {
4980 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4981 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4982 return;
4983
4984 VPTypeAnalysis TypeInfo(Plan);
4985
4987 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4989 continue;
4990
4993 continue;
4994
4995 // Bail out on recipes not supported at the moment:
4996 // * phi recipes other than the canonical induction
4997 // * recipes writing to memory except interleave groups
4998 // Only support plans with a canonical induction phi.
4999 if (R.isPhi())
5000 return;
5001
5002 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5003 if (R.mayWriteToMemory() && !InterleaveR)
5004 return;
5005
5006 // Do not narrow interleave groups if there are VectorPointer recipes and
5007 // the plan was unrolled. The recipe implicitly uses VF from
5008 // VPTransformState.
5009 // TODO: Remove restriction once the VF for the VectorPointer offset is
5010 // modeled explicitly as operand.
5011 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
5012 return;
5013
5014 // All other ops are allowed, but we reject uses that cannot be converted
5015 // when checking all allowed consumers (store interleave groups) below.
5016 if (!InterleaveR)
5017 continue;
5018
5019 // Bail out on non-consecutive interleave groups.
5020 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5021 VectorRegWidth))
5022 return;
5023
5024 // Skip read interleave groups.
5025 if (InterleaveR->getStoredValues().empty())
5026 continue;
5027
5028 // Narrow interleave groups, if all operands are already matching narrow
5029 // ops.
5030 auto *Member0 = InterleaveR->getStoredValues()[0];
5031 if (isAlreadyNarrow(Member0) &&
5032 all_of(InterleaveR->getStoredValues(),
5033 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
5034 StoreGroups.push_back(InterleaveR);
5035 continue;
5036 }
5037
5038 // For now, we only support full interleave groups storing load interleave
5039 // groups.
5040 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5041 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5042 if (!DefR)
5043 return false;
5044 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5045 return IR && IR->getInterleaveGroup()->isFull() &&
5046 IR->getVPValue(Op.index()) == Op.value();
5047 })) {
5048 StoreGroups.push_back(InterleaveR);
5049 continue;
5050 }
5051
5052 // Check if all values feeding InterleaveR are matching wide recipes whose
5053 // operands can be narrowed.
5054 auto *WideMember0 =
5055 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
5056 if (!WideMember0)
5057 return;
5058 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
5060 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5061 R->getNumOperands() > 2)
5062 return;
5063 if (any_of(enumerate(R->operands()),
5064 [WideMember0, Idx = I](const auto &P) {
5065 const auto &[OpIdx, OpV] = P;
5066 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5067 }))
5068 return;
5069 }
5070 StoreGroups.push_back(InterleaveR);
5071 }
5072
5073 if (StoreGroups.empty())
5074 return;
5075
5076 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5077 SmallPtrSet<VPValue *, 4> NarrowedOps;
5078 // Narrow operation tree rooted at store groups.
5079 for (auto *StoreGroup : StoreGroups) {
5080 VPValue *Res =
5081 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5082 auto *SI =
5083 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5084 auto *S = new VPWidenStoreRecipe(
5085 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5086 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5087 S->insertBefore(StoreGroup);
5088 StoreGroup->eraseFromParent();
5089 }
5090
5091 // Adjust induction to reflect that the transformed plan only processes one
5092 // original iteration.
5093 auto *CanIV = VectorLoop->getCanonicalIV();
5094 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5095 VPBuilder PHBuilder(Plan.getVectorPreheader());
5096
5097 VPValue *UF = Plan.getOrAddLiveIn(
5098 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5099 if (VF.isScalable()) {
5100 VPValue *VScale = PHBuilder.createElementCount(
5102 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5103 Instruction::Mul, {VScale, UF}, {true, false});
5104 Inc->setOperand(1, VScaleUF);
5105 Plan.getVF().replaceAllUsesWith(VScale);
5106 } else {
5107 Inc->setOperand(1, UF);
5109 Plan.getConstantInt(CanIV->getScalarType(), 1));
5110 }
5111 removeDeadRecipes(Plan);
5112}
5113
5114/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5115/// BranchOnCond recipe.
5117 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5118 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5119 auto *MiddleTerm =
5121 // Only add branch metadata if there is a (conditional) terminator.
5122 if (!MiddleTerm)
5123 return;
5124
5125 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5126 "must have a BranchOnCond");
5127 // Assume that `TripCount % VectorStep` is equally distributed.
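// For instance (illustrative): with UF = 2 and VF = 4, VectorStep is 8 and the
// terminator gets branch weights {1, 7}, matching the assumption that
// TripCount % VectorStep is uniformly distributed over its 8 possible values.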
5128 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5129 if (VF.isScalable() && VScaleForTuning.has_value())
5130 VectorStep *= *VScaleForTuning;
5131 assert(VectorStep > 0 && "trip count should not be zero");
5132 MDBuilder MDB(Plan.getContext());
5133 MDNode *BranchWeights =
5134 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5135 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5136}
5137
5138/// Compute and return the end value for \p WideIV, unless it is truncated. If
5139/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5140/// compute the end value of the induction.
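/// For example (illustrative): for an integer induction with start S and step
/// C, the end value computed in the vector preheader is S + VectorTC * C; for
/// the canonical induction (S = 0, C = 1) it is the vector trip count itself.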
5141 static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
5142 VPBuilder &VectorPHBuilder,
5143 VPTypeAnalysis &TypeInfo,
5144 VPValue *VectorTC) {
5145 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5146 // Truncated wide inductions resume from the last lane of their vector value
5147 // in the last vector iteration, which is handled elsewhere.
5148 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5149 return nullptr;
5150
5151 VPIRValue *Start = WideIV->getStartValue();
5152 VPValue *Step = WideIV->getStepValue();
5154 VPValue *EndValue = VectorTC;
5155 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5156 EndValue = VectorPHBuilder.createDerivedIV(
5157 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5158 Start, VectorTC, Step);
5159 }
5160
5161 // EndValue is derived from the vector trip count (which has the same type as
5162 // the widest induction) and thus may be wider than the induction here.
5163 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5164 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5165 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5166 ScalarTypeOfWideIV,
5167 WideIV->getDebugLoc());
5168 }
5169
5170 return EndValue;
5171}
5172
5174 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5175 VPTypeAnalysis TypeInfo(Plan);
5176 auto *ScalarPH = Plan.getScalarPreheader();
5177 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5178 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5179 VPBuilder VectorPHBuilder(
5180 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5181 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5182 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5183 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5184
5185 // TODO: Extract final value from induction recipe initially, optimize to
5186 // pre-computed end value together in optimizeInductionExitUsers.
5187 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5188 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5190 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5191 IVEndValues[WideIVR] = EndValue;
5192 ResumePhiR->setOperand(0, EndValue);
5193 ResumePhiR->setName("bc.resume.val");
5194 continue;
5195 }
5196 // TODO: Also handle truncated inductions here. Computing end-values
5197 // separately should be done as VPlan-to-VPlan optimization, after
5198 // legalizing all resume values to use the last lane from the loop.
5199 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5200 "should only skip truncated wide inductions");
5201 continue;
5202 }
5203
5204 // The backedge value provides the value to resume coming out of a loop,
5205 // which for FORs is a vector whose last element needs to be extracted. The
5206 // start value provides the value if the loop is bypassed.
5207 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5208 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5209 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5210 "Cannot handle loops with uncountable early exits");
5211 if (IsFOR) {
5212 auto *ExtractPart = MiddleBuilder.createNaryOp(
5213 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5214 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5216 "vector.recur.extract");
5217 }
5218 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5219 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5220 }
5221}
5222
5224 VFRange &Range) {
5225 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5226 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5227 auto *MiddleVPBB = Plan.getMiddleBlock();
5228 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5229 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5230
5231 auto IsScalableOne = [](ElementCount VF) -> bool {
5232 return VF == ElementCount::getScalable(1);
5233 };
5234
5235 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5236 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5237 if (!FOR)
5238 continue;
5239
5240 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5241 "Cannot handle loops with uncountable early exits");
5242
5243 // This is the second phase of vectorizing first-order recurrences, creating
5244 // extracts for users outside the loop. An overview of the transformation is
5245 // described below. Suppose we have the following loop, with a use after the
5246 // loop of the last value of a[i-1]:
5247 //
5248 // for (int i = 0; i < n; ++i) {
5249 // t = a[i - 1];
5250 // b[i] = a[i] - t;
5251 // }
5252 // use t;
5253 //
5254 // There is a first-order recurrence on "a". For this loop, the shorthand
5255 // scalar IR looks like:
5256 //
5257 // scalar.ph:
5258 // s.init = a[-1]
5259 // br scalar.body
5260 //
5261 // scalar.body:
5262 // i = phi [0, scalar.ph], [i+1, scalar.body]
5263 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5264 // s2 = a[i]
5265 // b[i] = s2 - s1
5266 // br cond, scalar.body, exit.block
5267 //
5268 // exit.block:
5269 // use = lcssa.phi [s1, scalar.body]
5270 //
5271 // In this example, s1 is a recurrence because its value depends on the
5272 // previous iteration. In the first phase of vectorization, we created a
5273 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5274 // for users in the scalar preheader and exit block.
5275 //
5276 // vector.ph:
5277 // v_init = vector(..., ..., ..., a[-1])
5278 // br vector.body
5279 //
5280 // vector.body
5281 // i = phi [0, vector.ph], [i+4, vector.body]
5282 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5283 // v2 = a[i, i+1, i+2, i+3]
5284 // b[i] = v2 - v1
5285 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5286 // b[i, i+1, i+2, i+3] = v2 - v1
5287 // br cond, vector.body, middle.block
5288 //
5289 // middle.block:
5290 // vector.recur.extract.for.phi = v2(2)
5291 // vector.recur.extract = v2(3)
5292 // br cond, scalar.ph, exit.block
5293 //
5294 // scalar.ph:
5295 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5296 // [s.init, otherwise]
5297 // br scalar.body
5298 //
5299 // scalar.body:
5300 // i = phi [0, scalar.ph], [i+1, scalar.body]
5301 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5302 // s2 = a[i]
5303 // b[i] = s2 - s1
5304 // br cond, scalar.body, exit.block
5305 //
5306 // exit.block:
5307 // lo = lcssa.phi [s1, scalar.body],
5308 // [vector.recur.extract.for.phi, middle.block]
5309 //
5310 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5311 // Extract the penultimate value of the recurrence and use it as operand for
5312 // the VPIRInstruction modeling the phi.
5314 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5316 continue;
5317
5318 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5319 // penultimate value of the recurrence. Instead we rely on the existing
5320 // extract of the last element from the result of
5321 // VPInstruction::FirstOrderRecurrenceSplice.
5322 // TODO: Consider vscale_range info and UF.
5324 Range))
5325 return;
5326 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5327 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5328 "vector.recur.extract.for.phi");
5329 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5330 }
5331 }
5332}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
APInt abs() const
Get the absolute value.
Definition APInt.h:1796
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1549
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:293
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3592
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3949
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4024
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3976
iterator end()
Definition VPlan.h:3986
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3984
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4037
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:228
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:589
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:561
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:635
const VPRecipeBase & back() const
Definition VPlan.h:3998
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2499
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2533
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2523
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2539
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2519
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:198
size_t getNumSuccessors() const
Definition VPlan.h:219
size_t getNumPredecessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:173
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:178
void setParent(VPRegionBlock *P)
Definition VPlan.h:184
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:221
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:242
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:154
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:173
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:191
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3003
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3535
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:453
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:426
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:438
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:448
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3703
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3624
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3048
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2020
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2063
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2052
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4102
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4126
Class to record and manage LLVM IR flags.
Definition VPlan.h:608
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:980
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1034
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1136
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1078
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1073
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1070
@ CanonicalIVIncrementForPart
Definition VPlan.h:1054
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2641
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2633
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2662
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2715
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2673
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3190
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4254
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2877
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2766
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4137
const VPBlockBase * getEntry() const
Definition VPlan.h:4173
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4248
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4205
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4190
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4235
const VPBlockBase * getExiting() const
Definition VPlan.h:4185
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4198
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2922
bool isSingleScalar() const
Definition VPlan.h:2963
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2987
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3771
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:594
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:229
operand_range operands()
Definition VPlanValue.h:297
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:273
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:268
void addOperand(VPValue *Operand)
Definition VPlanValue.h:262
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:45
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:133
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1382
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:119
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:72
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:173
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1385
unsigned getNumUsers() const
Definition VPlanValue.h:105
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1389
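A minimal usage sketch, assuming two VPValue pointers OldV and NewV (both hypothetical names): redirect only those uses of OldV that appear as a user's first operand, leaving all other uses untouched.
  OldV->replaceUsesWithIf(NewV, [](VPUser &U, unsigned OpIdx) {
    // Only rewrite uses in operand position 0; other operand slots keep OldV.
    return OpIdx == 0;
  });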
user_range users()
Definition VPlanValue.h:126
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1879
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3666
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1570
Instruction::CastOps getOpcode() const
Definition VPlan.h:1606
A recipe for handling GEP instructions.
Definition VPlan.h:1816
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2087
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2115
PHINode * getPHINode() const
Definition VPlan.h:2132
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2118
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2135
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2166
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2213
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2244
A recipe for widening vector intrinsics.
Definition VPlan.h:1620
A common base class for widening memory operations.
Definition VPlan.h:3233
A recipe for widened phis.
Definition VPlan.h:2302
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1522
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4267
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4546
bool hasVF(ElementCount VF) const
Definition VPlan.h:4464
LLVMContext & getContext() const
Definition VPlan.h:4452
VPBasicBlock * getEntry()
Definition VPlan.h:4356
bool hasScalableVF() const
Definition VPlan.h:4465
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4450
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4446
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4414
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4435
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4525
unsigned getUF() const
Definition VPlan.h:4484
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4594
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4549
bool hasUF(unsigned UF) const
Definition VPlan.h:4482
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4404
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4443
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4506
void setVF(ElementCount VF)
Definition VPlan.h:4458
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4497
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1022
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4428
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4381
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4572
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4522
bool hasScalarVFOnly() const
Definition VPlan.h:4475
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4395
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4400
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4361
void setUF(unsigned UF)
Definition VPlan.h:4489
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4626
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4528
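A minimal sketch, assuming a VPlan &Plan and an integer IR type Int64Ty (a hypothetical name here): constants and booleans are wrapped as live-in VPIRValues so they can be used directly as recipe operands.
  VPIRValue *Zero = Plan.getConstantInt(Int64Ty, 0);
  VPIRValue *BoolTrue = Plan.getTrue();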
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ? R : false.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
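A minimal sketch, assuming an IR Value *V (m_Value is a standard llvm::PatternMatch binder not shown in this index): bind the variable operand and the constant multiplier of a multiply, if V has that shape.
  using namespace llvm::PatternMatch;
  const APInt *C = nullptr;
  Value *X = nullptr;
  if (match(V, m_Mul(m_Value(X), m_APInt(C)))) {
    // X is the non-constant operand, C the (possibly splatted) constant multiplier.
  }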
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
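A minimal sketch, assuming a VPValue *Def taken from a VPlan: check whether Def is produced by an ExtractLastLane VPInstruction, without caring which vector operand it reads from.
  using namespace llvm::VPlanPatternMatch;
  if (match(Def, m_ExtractLastLane(m_VPValue()))) {
    // Def is the last lane of some vector operand.
  }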
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:93
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
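A minimal sketch, assuming a recipe R with VPValue operands: the range-based wrapper avoids spelling out begin()/end() when checking a property of every operand.
  bool AllOutside = all_of(R.operands(), [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });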
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
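A minimal sketch, assuming a VPBasicBlock *VPBB and a hypothetical predicate shouldErase: the early-increment wrapper advances the iterator before yielding each recipe, so erasing the current recipe does not invalidate the traversal.
  for (VPRecipeBase &R : make_early_inc_range(*VPBB))
    if (shouldErase(R))
      R.eraseFromParent();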
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
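A minimal sketch, assuming a VPlan &Plan and a hypothetical visitBlock callback: the shallow variant stops at region boundaries, while the deep variant also descends into nested replicate and loop regions.
  for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry()))
    if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB))
      visitBlock(VPBB);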
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1726
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
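A minimal sketch, assuming a recipe is keyed by its opcode and operand pointers (e.g. for a CSE-style equivalence map); makeKey is a hypothetical helper.
  static hash_code makeKey(unsigned Opcode, ArrayRef<VPValue *> Operands) {
    return hash_combine(Opcode,
                        hash_combine_range(Operands.begin(), Operands.end()));
  }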
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2344
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:184
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:137
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:202
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3366
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3324
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3450
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3407
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
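A minimal sketch, assuming a hypothetical transform removeRedundantFoo with the signature runPass expects; the helper simply forwards the plan and any extra arguments to the transform.
  static bool removeRedundantFoo(VPlan &Plan) {
    // ... mutate Plan and report whether anything changed ...
    return false;
  }
  // Elsewhere in VPlanTransforms:
  runPass(removeRedundantFoo, Plan);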
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users of first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by intro...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...