VPlanTransforms.cpp
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlan &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
95 VPIRMetadata(*Load), Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 Store->getAlign(), VPIRMetadata(*Store),
101 Ingredient.getDebugLoc());
102 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
103 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
104 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
105 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
106 if (VectorID == Intrinsic::not_intrinsic)
107 return false;
108 NewRecipe = new VPWidenIntrinsicRecipe(
109 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
110 drop_end(Ingredient.operands()), CI->getType(),
111 CI->getDebugLoc());
112 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
113 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
114 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
115 NewRecipe = new VPWidenCastRecipe(
116 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
117 } else {
118 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
119 }
120 }
121
122 NewRecipe->insertBefore(&Ingredient);
123 if (NewRecipe->getNumDefinedValues() == 1)
124 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
125 else
126 assert(NewRecipe->getNumDefinedValues() == 0 &&
127 "Only recpies with zero or one defined values expected");
128 Ingredient.eraseFromParent();
129 }
130 }
131 return true;
132}
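// Illustrative sketch (approximate VPlan notation, not from the original
// file): the conversion above replaces plain VPInstructions/VPPhis wrapping
// IR instructions with widened recipes, e.g.
//
//   EMIT ir<%l>  = load ir<%p>       -->  WIDEN ir<%l> = load ir<%p>
//   EMIT ir<%iv> = phi ...           -->  WIDEN-INDUCTION %iv = phi %start, %step
//   EMIT ir<%s>  = select ...        -->  WIDEN-SELECT ir<%s> = select ...
//
// Each converted ingredient is erased and its single defined value is
// replaced by the value defined by the new widened recipe.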
133
134/// Return true if we do not know how to (mechanically) hoist or sink \p R out
135/// of a loop region.
137 // Assumes don't alias anything or throw; as long as they're guaranteed to
138 // execute, they're safe to hoist.
140 return false;
141
142 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
143 // memory location is not modified in the vector loop.
144 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
145 return true;
146
147 // Allocas cannot be hoisted.
148 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
149 return RepR && RepR->getOpcode() == Instruction::Alloca;
150}
151
152static bool sinkScalarOperands(VPlan &Plan) {
153 auto Iter = vp_depth_first_deep(Plan.getEntry());
154 bool ScalarVFOnly = Plan.hasScalarVFOnly();
155 bool Changed = false;
156
158 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
159 VPBasicBlock *SinkTo, VPValue *Op) {
160 auto *Candidate =
161 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
162 if (!Candidate)
163 return;
164
165 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
166 // for now.
168 return;
169
170 if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() ||
171 Candidate->mayReadOrWriteMemory())
172 return;
173
174 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
175 if (!ScalarVFOnly && RepR->isSingleScalar())
176 return;
177
178 WorkList.insert({SinkTo, Candidate});
179 };
180
181 // First, collect the operands of all recipes in replicate blocks as seeds for
182 // sinking.
184 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
185 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
186 continue;
187 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
188 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
189 continue;
190 for (auto &Recipe : *VPBB)
191 for (VPValue *Op : Recipe.operands())
192 InsertIfValidSinkCandidate(VPBB, Op);
193 }
194
195 // Try to sink each replicate or scalar IV steps recipe in the worklist.
196 for (unsigned I = 0; I != WorkList.size(); ++I) {
197 VPBasicBlock *SinkTo;
198 VPSingleDefRecipe *SinkCandidate;
199 std::tie(SinkTo, SinkCandidate) = WorkList[I];
200
201 // All recipe users of SinkCandidate must be in the same block SinkTo or all
202 // users outside of SinkTo must only use the first lane of SinkCandidate. In
203 // the latter case, we need to duplicate SinkCandidate.
204 auto UsersOutsideSinkTo =
205 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
206 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
207 });
208 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
209 return !U->onlyFirstLaneUsed(SinkCandidate);
210 }))
211 continue;
212 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
213
214 if (NeedsDuplicating) {
215 if (ScalarVFOnly)
216 continue;
217 VPSingleDefRecipe *Clone;
218 if (auto *SinkCandidateRepR =
219 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
220 // TODO: Handle converting to uniform recipes as separate transform,
221 // then cloning should be sufficient here.
222 Instruction *I = SinkCandidate->getUnderlyingInstr();
223 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
224 nullptr /*Mask*/, *SinkCandidateRepR);
225 // TODO: add ".cloned" suffix to name of Clone's VPValue.
226 } else {
227 Clone = SinkCandidate->clone();
228 }
229
230 Clone->insertBefore(SinkCandidate);
231 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
232 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
233 });
234 }
235 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
236 for (VPValue *Op : SinkCandidate->operands())
237 InsertIfValidSinkCandidate(SinkTo, Op);
238 Changed = true;
239 }
240 return Changed;
241}
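// Illustrative sketch (not from the original file) of the sinking performed
// above. A scalar computation defined outside a replicate region but only
// used inside its predicated "then" block is moved (or cloned, if it also has
// first-lane-only users elsewhere) into that block:
//
//   vector.body:                          pred.store.if:
//     %t = add %a, %b          ==>          %t = add %a, %b
//   pred.store.if:                          store %t, %addr
//     store %t, %addr
//
// so the replicated computation only happens under the predicate.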
242
243/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
244/// the mask.
246 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
247 if (!EntryBB || EntryBB->size() != 1 ||
248 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
249 return nullptr;
250
251 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
252}
253
254/// If \p R is a triangle region, return the 'then' block of the triangle.
256 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
257 if (EntryBB->getNumSuccessors() != 2)
258 return nullptr;
259
260 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
261 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
262 if (!Succ0 || !Succ1)
263 return nullptr;
264
265 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
266 return nullptr;
267 if (Succ0->getSingleSuccessor() == Succ1)
268 return Succ0;
269 if (Succ1->getSingleSuccessor() == Succ0)
270 return Succ1;
271 return nullptr;
272}
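// Illustrative sketch (not from the original file): the "triangle" shape
// matched by getPredicatedThenBlock looks like
//
//      entry
//      /   \
//   then    |
//      \   /
//      merge
//
// i.e. the region entry has two successors, exactly one of which ("then")
// has the other successor ("merge") as its single successor.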
273
274// Merge replicate regions into their successor regions, if a replicate region
275// is connected to a successor replicate region with the same predicate by a
276// single, empty VPBasicBlock.
278 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
279
280 // Collect replicate regions followed by an empty block, followed by another
281 // replicate region with matching masks to process front. This is to avoid
282 // iterator invalidation issues while merging regions.
285 vp_depth_first_deep(Plan.getEntry()))) {
286 if (!Region1->isReplicator())
287 continue;
288 auto *MiddleBasicBlock =
289 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
290 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
291 continue;
292
293 auto *Region2 =
294 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
295 if (!Region2 || !Region2->isReplicator())
296 continue;
297
298 VPValue *Mask1 = getPredicatedMask(Region1);
299 VPValue *Mask2 = getPredicatedMask(Region2);
300 if (!Mask1 || Mask1 != Mask2)
301 continue;
302
303 assert(Mask1 && Mask2 && "both regions must have conditions");
304 WorkList.push_back(Region1);
305 }
306
307 // Move recipes from Region1 to its successor region, if both are triangles.
308 for (VPRegionBlock *Region1 : WorkList) {
309 if (TransformedRegions.contains(Region1))
310 continue;
311 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
312 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
313
314 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
315 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
316 if (!Then1 || !Then2)
317 continue;
318
319 // Note: No fusion-preventing memory dependencies are expected in either
320 // region. Such dependencies should be rejected during earlier dependence
321 // checks, which guarantee accesses can be re-ordered for vectorization.
322 //
323 // Move recipes to the successor region.
324 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
325 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
326
327 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
328 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
329
330 // Move VPPredInstPHIRecipes from the merge block to the successor region's
331 // merge block. Update all users inside the successor region to use the
332 // original values.
333 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
334 VPValue *PredInst1 =
335 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
336 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
337 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
338 return cast<VPRecipeBase>(&U)->getParent() == Then2;
339 });
340
341 // Remove phi recipes that are unused after merging the regions.
342 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
343 Phi1ToMove.eraseFromParent();
344 continue;
345 }
346 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
347 }
348
349 // Remove the dead recipes in Region1's entry block.
350 for (VPRecipeBase &R :
351 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
352 R.eraseFromParent();
353
354 // Finally, remove the first region.
355 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
356 VPBlockUtils::disconnectBlocks(Pred, Region1);
357 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
358 }
359 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
360 TransformedRegions.insert(Region1);
361 }
362
363 return !TransformedRegions.empty();
364}
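// Illustrative sketch (not from the original file) of the merge above. Two
// replicate regions guarded by the same mask and separated only by an empty
// block,
//
//   (entry1 -> then1 -> merge1) -> empty -> (entry2 -> then2 -> merge2)
//
// are fused by moving the recipes from then1 into then2 and the still-used
// VPPredInstPHIRecipes from merge1 into merge2; region1 is then disconnected
// and its predecessors are wired directly to the empty block.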
365
367 VPlan &Plan) {
368 Instruction *Instr = PredRecipe->getUnderlyingInstr();
369 // Build the triangular if-then region.
370 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
371 assert(Instr->getParent() && "Predicated instruction not in any basic block");
372 auto *BlockInMask = PredRecipe->getMask();
373 auto *MaskDef = BlockInMask->getDefiningRecipe();
374 auto *BOMRecipe = new VPBranchOnMaskRecipe(
375 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
376 auto *Entry =
377 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
378
379 // Replace predicated replicate recipe with a replicate recipe without a
380 // mask but in the replicate region.
381 auto *RecipeWithoutMask = new VPReplicateRecipe(
382 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
383 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
384 auto *Pred =
385 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
386
387 VPPredInstPHIRecipe *PHIRecipe = nullptr;
388 if (PredRecipe->getNumUsers() != 0) {
389 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
390 RecipeWithoutMask->getDebugLoc());
391 PredRecipe->replaceAllUsesWith(PHIRecipe);
392 PHIRecipe->setOperand(0, RecipeWithoutMask);
393 }
394 PredRecipe->eraseFromParent();
395 auto *Exiting =
396 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
398 Plan.createReplicateRegion(Entry, Exiting, RegionName);
399
400 // Note: first set Entry as region entry and then connect successors starting
401 // from it in order, to propagate the "parent" of each VPBasicBlock.
402 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
403 VPBlockUtils::connectBlocks(Pred, Exiting);
404
405 return Region;
406}
407
408static void addReplicateRegions(VPlan &Plan) {
411 vp_depth_first_deep(Plan.getEntry()))) {
412 for (VPRecipeBase &R : *VPBB)
413 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
414 if (RepR->isPredicated())
415 WorkList.push_back(RepR);
416 }
417 }
418
419 unsigned BBNum = 0;
420 for (VPReplicateRecipe *RepR : WorkList) {
421 VPBasicBlock *CurrentBlock = RepR->getParent();
422 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
423
424 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
425 SplitBlock->setName(
426 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
427 // Record predicated instructions for above packing optimizations.
429 Region->setParent(CurrentBlock->getParent());
431
432 VPRegionBlock *ParentRegion = Region->getParent();
433 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
434 ParentRegion->setExiting(SplitBlock);
435 }
436}
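// Illustrative sketch (not from the original file) of the structure created
// for a predicated replicate recipe by the two helpers above:
//
//   pred.<opcode>.entry:     BRANCH-ON-MASK %mask
//   pred.<opcode>.if:        REPLICATE %x = <opcode> ...   (mask dropped)
//   pred.<opcode>.continue:  PHI-PREDICATED-INSTRUCTION %p = %x
//
// The block containing the original recipe is split at the recipe and the new
// region is inserted between the two halves.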
437
438/// Remove redundant VPBasicBlocks by merging them into their predecessor if
439/// the predecessor has a single successor.
443 vp_depth_first_deep(Plan.getEntry()))) {
444 // Don't fold the blocks in the skeleton of the Plan into their single
445 // predecessors for now.
446 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
447 if (!VPBB->getParent())
448 continue;
449 auto *PredVPBB =
450 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
451 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
452 isa<VPIRBasicBlock>(PredVPBB))
453 continue;
454 WorkList.push_back(VPBB);
455 }
456
457 for (VPBasicBlock *VPBB : WorkList) {
458 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
459 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
460 R.moveBefore(*PredVPBB, PredVPBB->end());
461 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
462 auto *ParentRegion = VPBB->getParent();
463 if (ParentRegion && ParentRegion->getExiting() == VPBB)
464 ParentRegion->setExiting(PredVPBB);
465 for (auto *Succ : to_vector(VPBB->successors())) {
467 VPBlockUtils::connectBlocks(PredVPBB, Succ);
468 }
469 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
470 }
471 return !WorkList.empty();
472}
473
475 // Convert masked VPReplicateRecipes to if-then region blocks.
477
478 bool ShouldSimplify = true;
479 while (ShouldSimplify) {
480 ShouldSimplify = sinkScalarOperands(Plan);
481 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
482 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
483 }
484}
485
486/// Remove redundant casts of inductions.
487///
488/// Such redundant casts are casts of induction variables that can be ignored,
489/// because we already proved that the casted phi is equal to the uncasted phi
490/// in the vectorized loop. There is no need to vectorize the cast - the same
491/// value can be used for both the phi and casts in the vector loop.
493 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
495 if (!IV || IV->getTruncInst())
496 continue;
497
498 // A sequence of IR Casts has potentially been recorded for IV, which
499 // *must be bypassed* when the IV is vectorized, because the vectorized IV
500 // will produce the desired casted value. This sequence forms a def-use
501 // chain and is provided in reverse order, ending with the cast that uses
502 // the IV phi. Search for the recipe of the last cast in the chain and
503 // replace it with the original IV. Note that only the final cast is
504 // expected to have users outside the cast-chain and the dead casts left
505 // over will be cleaned up later.
506 auto &Casts = IV->getInductionDescriptor().getCastInsts();
507 VPValue *FindMyCast = IV;
508 for (Instruction *IRCast : reverse(Casts)) {
509 VPSingleDefRecipe *FoundUserCast = nullptr;
510 for (auto *U : FindMyCast->users()) {
511 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
512 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
513 FoundUserCast = UserCast;
514 break;
515 }
516 }
517 FindMyCast = FoundUserCast;
518 }
519 FindMyCast->replaceAllUsesWith(IV);
520 }
521}
522
523/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
524/// recipe, if it exists.
526 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
527 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
528 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
529 for (VPUser *U : CanonicalIV->users()) {
531 if (WidenNewIV)
532 break;
533 }
534
535 if (!WidenNewIV)
536 return;
537
538 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
539 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
540 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
541
542 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
543 continue;
544
545 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
546 // everything WidenNewIV's users need. That is, WidenOriginalIV will
547 // generate a vector phi or all users of WidenNewIV demand the first lane
548 // only.
549 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
550 vputils::onlyFirstLaneUsed(WidenNewIV)) {
551 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
552 WidenNewIV->eraseFromParent();
553 return;
554 }
555 }
556}
557
558/// Returns true if \p R is dead and can be removed.
559static bool isDeadRecipe(VPRecipeBase &R) {
560 // Do remove conditional assume instructions as their conditions may be
561 // flattened.
562 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
563 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
565 if (IsConditionalAssume)
566 return true;
567
568 if (R.mayHaveSideEffects())
569 return false;
570
571 // Recipe is dead if no user keeps the recipe alive.
572 return all_of(R.definedValues(),
573 [](VPValue *V) { return V->getNumUsers() == 0; });
574}
575
578 vp_post_order_deep(Plan.getEntry()))) {
579 // The recipes in the block are processed in reverse order, to catch chains
580 // of dead recipes.
581 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
582 if (isDeadRecipe(R)) {
583 R.eraseFromParent();
584 continue;
585 }
586
587 // Check if R is a dead VPPhi <-> update cycle and remove it.
588 auto *PhiR = dyn_cast<VPPhi>(&R);
589 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
590 continue;
591 VPValue *Incoming = PhiR->getOperand(1);
592 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
593 Incoming->getNumUsers() != 1)
594 continue;
595 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
596 PhiR->eraseFromParent();
597 Incoming->getDefiningRecipe()->eraseFromParent();
598 }
599 }
600}
601
604 Instruction::BinaryOps InductionOpcode,
605 FPMathOperator *FPBinOp, Instruction *TruncI,
606 VPValue *StartV, VPValue *Step, DebugLoc DL,
607 VPBuilder &Builder) {
608 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
609 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
610 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
611 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
612 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
613
614 // Truncate base induction if needed.
615 VPTypeAnalysis TypeInfo(Plan);
616 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
617 if (TruncI) {
618 Type *TruncTy = TruncI->getType();
619 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
620 "Not truncating.");
621 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
622 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
623 ResultTy = TruncTy;
624 }
625
626 // Truncate step if needed.
627 Type *StepTy = TypeInfo.inferScalarType(Step);
628 if (ResultTy != StepTy) {
629 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
630 "Not truncating.");
631 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
632 auto *VecPreheader =
634 VPBuilder::InsertPointGuard Guard(Builder);
635 Builder.setInsertPoint(VecPreheader);
636 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
637 }
638 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
639 &Plan.getVF(), DL);
640}
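// Illustrative sketch (approximate notation, not from the original file) of
// the recipes built by createScalarIVSteps for an induction with start %s and
// step %st, based on the canonical IV %civ:
//
//   DERIVED-IV %offset.idx = %s + %civ * %st   (per induction kind; truncated
//                                               if a trunc instruction is given)
//   SCALAR-STEPS %steps    = %offset.idx, %st, vf
//
// The step itself is truncated in the vector preheader if its type is wider
// than the (possibly truncated) result type.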
641
644 for (unsigned I = 0; I != Users.size(); ++I) {
646 if (isa<VPHeaderPHIRecipe>(Cur))
647 continue;
648 for (VPValue *V : Cur->definedValues())
649 Users.insert_range(V->users());
650 }
651 return Users.takeVector();
652}
653
654/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
655/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
656/// VPWidenPointerInductionRecipe will generate vectors only. If some users
657/// require vectors while others require scalars, the scalar uses need to
658/// extract the scalars from the generated vectors (note that this is different
659/// from how int/fp inductions are handled). Legalize extract-from-ends using
660/// uniform VPReplicateRecipe of wide inductions to use regular
661/// VPReplicateRecipe, so the correct end value is available. Also optimize
662/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
663/// providing them scalar steps built on the canonical scalar IV and updating
664/// the original IV's users. This is an optional optimization to reduce the
665/// need for vector extracts.
668 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
669 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
670 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
671 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
672 if (!PhiR)
673 continue;
674
675 // Try to narrow wide and replicating recipes to uniform recipes, based on
676 // VPlan analysis.
677 // TODO: Apply to all recipes in the future, to replace legacy uniformity
678 // analysis.
679 auto Users = collectUsersRecursively(PhiR);
680 for (VPUser *U : reverse(Users)) {
681 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
682 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
683 // Skip recipes that shouldn't be narrowed.
684 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
685 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
686 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
687 continue;
688
689 // Skip recipes that may have lanes other than their first one used.
691 continue;
692
693 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
694 Def->operands(), /*IsUniform*/ true);
695 Clone->insertAfter(Def);
696 Def->replaceAllUsesWith(Clone);
697 }
698
699 // Replace wide pointer inductions which have only their scalars used by
700 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
701 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
702 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
703 continue;
704
705 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
706 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
707 VPValue *StepV = PtrIV->getOperand(1);
709 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
710 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
711
712 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
713 PtrIV->getDebugLoc(), "next.gep");
714
715 PtrIV->replaceAllUsesWith(PtrAdd);
716 continue;
717 }
718
719 // Replace widened induction with scalar steps for users that only use
720 // scalars.
721 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
722 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
723 return U->usesScalars(WideIV);
724 }))
725 continue;
726
727 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
729 Plan, ID.getKind(), ID.getInductionOpcode(),
730 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
731 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
732 WideIV->getDebugLoc(), Builder);
733
734 // Update scalar users of IV to use Step instead.
735 if (!HasOnlyVectorVFs)
736 WideIV->replaceAllUsesWith(Steps);
737 else
738 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
739 return U.usesScalars(WideIV);
740 });
741 }
742}
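// Illustrative sketch (not from the original file): for a wide pointer
// induction whose vector values are never used, the loop above replaces
//
//   WIDEN-POINTER-INDUCTION %p = phi %start, %step
// with
//   SCALAR-STEPS %steps = 0, %step
//   EMIT %next.gep      = ptradd %start, %steps
//
// Widened int/fp inductions keep their vector phi, while users that only need
// scalars are redirected to equivalent SCALAR-STEPS recipes.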
743
744/// Check if \p VPV is an untruncated wide induction, either before or after the
745/// increment. If so return the header IV (before the increment), otherwise
746/// return null.
748 ScalarEvolution &SE) {
749 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
750 if (WideIV) {
751 // VPV itself is a wide induction, separately compute the end value for exit
752 // users if it is not a truncated IV.
753 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
754 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
755 }
756
757 // Check if VPV is an optimizable induction increment.
758 VPRecipeBase *Def = VPV->getDefiningRecipe();
759 if (!Def || Def->getNumOperands() != 2)
760 return nullptr;
761 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
762 if (!WideIV)
763 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
764 if (!WideIV)
765 return nullptr;
766
767 auto IsWideIVInc = [&]() {
768 auto &ID = WideIV->getInductionDescriptor();
769
770 // Check if VPV increments the induction by the induction step.
771 VPValue *IVStep = WideIV->getStepValue();
772 switch (ID.getInductionOpcode()) {
773 case Instruction::Add:
774 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
775 case Instruction::FAdd:
777 m_Specific(IVStep)));
778 case Instruction::FSub:
779 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
780 m_Specific(IVStep)));
781 case Instruction::Sub: {
782 // IVStep will be the negated step of the subtraction. Check if Step == -1
783 // * IVStep.
784 VPValue *Step;
785 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
786 return false;
787 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
788 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
789 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
790 !isa<SCEVCouldNotCompute>(StepSCEV) &&
791 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
792 }
793 default:
794 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
795 match(VPV, m_GetElementPtr(m_Specific(WideIV),
796 m_Specific(WideIV->getStepValue())));
797 }
798 llvm_unreachable("should have been covered by switch above");
799 };
800 return IsWideIVInc() ? WideIV : nullptr;
801}
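// Illustrative examples (not from the original file) of values accepted by
// getOptimizableIVOf. With a wide IV %iv whose step is %st, both %iv itself
// and its increment are recognized:
//
//   %iv.next = add %iv, %st             ; also fadd/fsub forms
//   %iv.next = sub %iv, %x              ; where SCEV(%st) == -SCEV(%x)
//   %iv.next = getelementptr %iv, %st   ; pointer inductions
//
// In all cases the header phi %iv (the pre-incremented value) is returned.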
802
803/// Attempts to optimize the induction variable exit values for users in the
804/// early exit block.
806 VPTypeAnalysis &TypeInfo,
807 VPBlockBase *PredVPBB,
808 VPValue *Op,
809 ScalarEvolution &SE) {
810 VPValue *Incoming, *Mask;
813 return nullptr;
814
815 auto *WideIV = getOptimizableIVOf(Incoming, SE);
816 if (!WideIV)
817 return nullptr;
818
819 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
820 if (WideIntOrFp && WideIntOrFp->getTruncInst())
821 return nullptr;
822
823 // Calculate the final index.
824 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
825 auto *CanonicalIV = LoopRegion->getCanonicalIV();
826 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
827 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
828
829 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
830 VPValue *FirstActiveLane =
831 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
832 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
833 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
834 FirstActiveLaneType, DL);
835 VPValue *EndValue =
836 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
837
838 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
839 // Incoming differs from it, the exit is using the incremented value and we
840 // need to advance the index by one to account for the extra step.
841 if (Incoming != WideIV) {
842 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
843 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
844 }
845
846 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
847 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
848 VPValue *Start = WideIV->getStartValue();
849 VPValue *Step = WideIV->getStepValue();
850 EndValue = B.createDerivedIV(
851 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
852 Start, EndValue, Step);
853 }
854
855 return EndValue;
856}
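// Illustrative sketch (not from the original file): the exit value computed
// above for an early-exit user is, conceptually,
//
//   %lane = first-active-lane(%mask)
//   %end  = %canonical.iv + %lane            (+ 1 if the user reads iv.next)
//   %end  = derived-iv(%start, %end, %step)  unless the IV is canonical
//
// i.e. the scalar iteration index at which the early exit was taken, mapped
// back onto the original induction.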
857
858/// Attempts to optimize the induction variable exit values for users in the
859/// exit block coming from the latch in the original scalar loop.
861 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
865 return nullptr;
866
867 auto *WideIV = getOptimizableIVOf(Incoming, SE);
868 if (!WideIV)
869 return nullptr;
870
871 VPValue *EndValue = EndValues.lookup(WideIV);
872 assert(EndValue && "end value must have been pre-computed");
873
874 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if
875 // Incoming differs from it, the exit is using the incremented value and we
876 // don't need to subtract the step.
877 if (Incoming != WideIV)
878 return EndValue;
879
880 // Otherwise, subtract the step from the EndValue.
881 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
882 VPValue *Step = WideIV->getStepValue();
883 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
884 if (ScalarTy->isIntegerTy())
885 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
886 if (ScalarTy->isPointerTy()) {
887 Type *StepTy = TypeInfo.inferScalarType(Step);
888 auto *Zero = Plan.getConstantInt(StepTy, 0);
889 return B.createPtrAdd(EndValue,
890 B.createNaryOp(Instruction::Sub, {Zero, Step}),
891 DebugLoc::getUnknown(), "ind.escape");
892 }
893 if (ScalarTy->isFloatingPointTy()) {
894 const auto &ID = WideIV->getInductionDescriptor();
895 return B.createNaryOp(
896 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
897 ? Instruction::FSub
898 : Instruction::FAdd,
899 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
900 }
901 llvm_unreachable("all possible induction types must be handled");
902 return nullptr;
903}
904
906 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
907 ScalarEvolution &SE) {
908 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
909 VPTypeAnalysis TypeInfo(Plan);
910 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
911 for (VPRecipeBase &R : ExitVPBB->phis()) {
912 auto *ExitIRI = cast<VPIRPhi>(&R);
913
914 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
915 VPValue *Escape = nullptr;
916 if (PredVPBB == MiddleVPBB)
917 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
918 ExitIRI->getOperand(Idx),
919 EndValues, SE);
920 else
921 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
922 ExitIRI->getOperand(Idx), SE);
923 if (Escape)
924 ExitIRI->setOperand(Idx, Escape);
925 }
926 }
927 }
928}
929
930/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
931/// them with already existing recipes expanding the same SCEV expression.
934
935 for (VPRecipeBase &R :
937 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
938 if (!ExpR)
939 continue;
940
941 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
942 if (Inserted)
943 continue;
944 ExpR->replaceAllUsesWith(V->second);
945 ExpR->eraseFromParent();
946 }
947}
948
950 SmallVector<VPValue *> WorkList;
952 WorkList.push_back(V);
953
954 while (!WorkList.empty()) {
955 VPValue *Cur = WorkList.pop_back_val();
956 if (!Seen.insert(Cur).second)
957 continue;
959 if (!R)
960 continue;
961 if (!isDeadRecipe(*R))
962 continue;
963 append_range(WorkList, R->operands());
964 R->eraseFromParent();
965 }
966}
967
968/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
969/// Returns an optional pair, where the first element indicates whether it is
970/// an intrinsic ID.
971static std::optional<std::pair<bool, unsigned>>
973 return TypeSwitch<const VPSingleDefRecipe *,
974 std::optional<std::pair<bool, unsigned>>>(R)
977 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
978 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
979 return std::make_pair(true, I->getVectorIntrinsicID());
980 })
981 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
982 // For recipes that do not directly map to LLVM IR instructions,
983 // assign opcodes after the last VPInstruction opcode (which is also
984 // after the last IR Instruction opcode), based on the VPDefID.
985 return std::make_pair(false,
986 VPInstruction::OpsEnd + 1 + I->getVPDefID());
987 })
988 .Default([](auto *) { return std::nullopt; });
989}
990
991/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
992/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
993/// Operands are foldable live-ins.
995 ArrayRef<VPValue *> Operands,
996 const DataLayout &DL,
997 VPTypeAnalysis &TypeInfo) {
998 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
999 if (!OpcodeOrIID)
1000 return nullptr;
1001
1003 for (VPValue *Op : Operands) {
1004 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1005 return nullptr;
1006 Ops.push_back(Op->getLiveInIRValue());
1007 }
1008
1009 auto FoldToIRValue = [&]() -> Value * {
1010 InstSimplifyFolder Folder(DL);
1011 if (OpcodeOrIID->first) {
1012 if (R.getNumOperands() != 2)
1013 return nullptr;
1014 unsigned ID = OpcodeOrIID->second;
1015 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1016 TypeInfo.inferScalarType(&R));
1017 }
1018 unsigned Opcode = OpcodeOrIID->second;
1019 if (Instruction::isBinaryOp(Opcode))
1020 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1021 Ops[0], Ops[1]);
1022 if (Instruction::isCast(Opcode))
1023 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1024 TypeInfo.inferScalarType(R.getVPSingleValue()));
1025 switch (Opcode) {
1027 return Folder.FoldSelect(Ops[0], Ops[1],
1029 case VPInstruction::Not:
1030 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1032 case Instruction::Select:
1033 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1034 case Instruction::ICmp:
1035 case Instruction::FCmp:
1036 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1037 Ops[1]);
1038 case Instruction::GetElementPtr: {
1039 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1040 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1041 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1042 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1043 }
1046 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1047 Ops[0], Ops[1],
1048 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1049 // An extract of a live-in is an extract of a broadcast, so return the
1050 // broadcasted element.
1051 case Instruction::ExtractElement:
1052 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1053 return Ops[0];
1054 }
1055 return nullptr;
1056 };
1057
1058 if (Value *V = FoldToIRValue())
1059 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1060 return nullptr;
1061}
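// Illustrative example (not from the original file) of the constant folding
// performed by tryToFoldLiveIns: if both operands of a widened add are
// live-in IR constants,
//
//   WIDEN %r = add ir<2>, ir<3>
//
// the folder produces the IR constant 5 and %r is replaced by a live-in
// wrapping it. The same applies to the handled casts, compares, GEPs,
// selects and binary intrinsics.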
1062
1063/// Try to simplify VPSingleDefRecipe \p Def.
1065 VPlan *Plan = Def->getParent()->getPlan();
1066
1067 // Simplification of live-in IR values for SingleDef recipes using
1068 // InstSimplifyFolder.
1069 const DataLayout &DL =
1071 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1072 return Def->replaceAllUsesWith(V);
1073
1074 // Fold PredPHI LiveIn -> LiveIn.
1075 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1076 VPValue *Op = PredPHI->getOperand(0);
1077 if (Op->isLiveIn())
1078 PredPHI->replaceAllUsesWith(Op);
1079 }
1080
1081 VPBuilder Builder(Def);
1082 VPValue *A;
1083 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1084 Type *TruncTy = TypeInfo.inferScalarType(Def);
1085 Type *ATy = TypeInfo.inferScalarType(A);
1086 if (TruncTy == ATy) {
1087 Def->replaceAllUsesWith(A);
1088 } else {
1089 // Don't replace a scalarizing recipe with a widened cast.
1090 if (isa<VPReplicateRecipe>(Def))
1091 return;
1092 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1093
1094 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1095 ? Instruction::SExt
1096 : Instruction::ZExt;
1097 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1098 TruncTy);
1099 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1100 // UnderlyingExt has a distinct return type, used to retain the legacy cost.
1101 Ext->setUnderlyingValue(UnderlyingExt);
1102 }
1103 Def->replaceAllUsesWith(Ext);
1104 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1105 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1106 Def->replaceAllUsesWith(Trunc);
1107 }
1108 }
1109#ifndef NDEBUG
1110 // Verify that the cached type info for both A and its users is still
1111 // accurate by comparing it to freshly computed types.
1112 VPTypeAnalysis TypeInfo2(*Plan);
1113 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1114 for (VPUser *U : A->users()) {
1115 auto *R = cast<VPRecipeBase>(U);
1116 for (VPValue *VPV : R->definedValues())
1117 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1118 }
1119#endif
1120 }
1121
1122 // Simplify (X && Y) || (X && !Y) -> X.
1123 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1124 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1125 // recipes to be visited during simplification.
1126 VPValue *X, *Y, *Z;
1127 if (match(Def,
1130 Def->replaceAllUsesWith(X);
1131 Def->eraseFromParent();
1132 return;
1133 }
1134
1135 // x | 1 -> 1
1136 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1137 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1138
1139 // x | 0 -> x
1140 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1141 return Def->replaceAllUsesWith(X);
1142
1143 // x & 0 -> 0
1144 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1145 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1146
1147 // x && false -> false
1148 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1149 return Def->replaceAllUsesWith(Def->getOperand(1));
1150
1151 // (x && y) || (x && z) -> x && (y || z)
1154 // Simplify only if one of the operands has one use to avoid creating an
1155 // extra recipe.
1156 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1157 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1158 return Def->replaceAllUsesWith(
1159 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1160
1161 // x && !x -> 0
1163 return Def->replaceAllUsesWith(Plan->getFalse());
1164
1165 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1166 return Def->replaceAllUsesWith(X);
1167
1168 // select !c, x, y -> select c, y, x
1169 VPValue *C;
1170 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1171 Def->setOperand(0, C);
1172 Def->setOperand(1, Y);
1173 Def->setOperand(2, X);
1174 return;
1175 }
1176
1177 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1178 // tail folding it is likely that x is a header mask and can be simplified
1179 // further.
1181 m_VPValue(Z))) &&
1182 X->hasMoreThanOneUniqueUser())
1183 return Def->replaceAllUsesWith(
1184 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1185
1186 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1187 return Def->replaceAllUsesWith(A);
1188
1189 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1190 return Def->replaceAllUsesWith(
1191 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1192
1193 if (match(Def, m_Not(m_VPValue(A)))) {
1194 if (match(A, m_Not(m_VPValue(A))))
1195 return Def->replaceAllUsesWith(A);
1196
1197 // Try to fold Not into compares by adjusting the predicate in-place.
1198 CmpPredicate Pred;
1199 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1200 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1201 if (all_of(Cmp->users(),
1203 m_Not(m_Specific(Cmp)),
1204 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1205 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1206 for (VPUser *U : to_vector(Cmp->users())) {
1207 auto *R = cast<VPSingleDefRecipe>(U);
1208 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1209 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1210 R->setOperand(1, Y);
1211 R->setOperand(2, X);
1212 } else {
1213 // not (cmp pred) -> cmp inv_pred
1214 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1215 R->replaceAllUsesWith(Cmp);
1216 }
1217 }
1218 // If Cmp doesn't have a debug location, use the one from the negation,
1219 // to preserve the location.
1220 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1221 Cmp->setDebugLoc(Def->getDebugLoc());
1222 }
1223 }
1224 }
1225
1226 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1227 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1228 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1229 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1230 TypeInfo.inferScalarType(Def))
1231 return Def->replaceAllUsesWith(Def->getOperand(1));
1232
1234 m_One()))) {
1235 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1236 if (TypeInfo.inferScalarType(X) != WideStepTy)
1237 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1238 Def->replaceAllUsesWith(X);
1239 return;
1240 }
1241
1242 // For i1 vp.merges produced by AnyOf reductions:
1243 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1245 m_VPValue(X), m_VPValue())) &&
1247 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1248 Def->setOperand(1, Def->getOperand(0));
1249 Def->setOperand(0, Y);
1250 return;
1251 }
1252
1253 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1254 if (Phi->getOperand(0) == Phi->getOperand(1))
1255 Phi->replaceAllUsesWith(Phi->getOperand(0));
1256 return;
1257 }
1258
1259 // Look through ExtractLastElement (BuildVector ....).
1262 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1263 Def->replaceAllUsesWith(
1264 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1265 return;
1266 }
1267
1268 // Look through ExtractPenultimateElement (BuildVector ....).
1270 m_BuildVector()))) {
1271 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1272 Def->replaceAllUsesWith(
1273 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1274 return;
1275 }
1276
1277 uint64_t Idx;
1279 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1280 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1281 return;
1282 }
1283
1284 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1285 Def->replaceAllUsesWith(
1286 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1287 return;
1288 }
1289
1290 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1291 if (Phi->getNumOperands() == 1)
1292 Phi->replaceAllUsesWith(Phi->getOperand(0));
1293 return;
1294 }
1295
1296 // Some simplifications can only be applied after unrolling. Perform them
1297 // below.
1298 if (!Plan->isUnrolled())
1299 return;
1300
1301 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1302 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1303 isa<VPPhi>(X)) {
1304 auto *Phi = cast<VPPhi>(X);
1305 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1306 Phi->getNumUsers() == 1 && (*Phi->user_begin() == Def)) {
1307 Phi->setOperand(0, Y);
1308 Def->replaceAllUsesWith(Phi);
1309 return;
1310 }
1311 }
1312
1313 // VPVectorPointer for part 0 can be replaced by their start pointer.
1314 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
1315 if (VecPtr->isFirstPart()) {
1316 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1317 return;
1318 }
1319 }
1320
1321 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1322 // the first lane is demanded.
1323 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1324 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1325 Steps->replaceAllUsesWith(Steps->getOperand(0));
1326 return;
1327 }
1328 }
1329 // Simplify redundant ReductionStartVector recipes after unrolling.
1330 VPValue *StartV;
1332 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1333 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1334 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1335 return PhiR && PhiR->isInLoop();
1336 });
1337 return;
1338 }
1339
1340 if (match(Def,
1343 Def->replaceAllUsesWith(A);
1344 return;
1345 }
1346
1351 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1352 all_of(A->users(),
1353 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1354 return Def->replaceAllUsesWith(A);
1355 }
1356
1357 if (Plan->getUF() == 1 &&
1359 return Def->replaceAllUsesWith(
1360 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1361 }
1362}
1363
1366 Plan.getEntry());
1367 VPTypeAnalysis TypeInfo(Plan);
1369 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1370 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1371 simplifyRecipe(Def, TypeInfo);
1372 }
1373}
1374
1376 if (Plan.hasScalarVFOnly())
1377 return;
1378
1379 // Try to narrow wide and replicating recipes to single scalar recipes,
1380 // based on VPlan analysis. Only process blocks in the loop region for now,
1381 // without traversing into nested regions, as recipes in replicate regions
1382 // cannot be converted yet.
1385 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1387 continue;
1388 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1389 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1390 continue;
1391
1392 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1393 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1394 vputils::isSingleScalar(RepR->getOperand(1))) {
1395 auto *Clone = new VPReplicateRecipe(
1396 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1397 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1398 Clone->insertBefore(RepOrWidenR);
1399 unsigned ExtractOpc =
1400 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1403 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1404 Ext->insertBefore(Clone);
1405 Clone->setOperand(0, Ext);
1406 RepR->eraseFromParent();
1407 continue;
1408 }
1409
1410 // Skip recipes that aren't single scalars or don't have only their
1411 // scalar results used. In the latter case, we would introduce extra
1412 // broadcasts.
1413 if (!vputils::isSingleScalar(RepOrWidenR) ||
1414 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1415 return U->usesScalars(RepOrWidenR) ||
1416 match(cast<VPRecipeBase>(U),
1417 m_CombineOr(m_ExtractLastElement(m_VPValue()),
1418 m_ExtractLastLanePerPart(m_VPValue())));
1419 }))
1420 continue;
1421
1422 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1423 RepOrWidenR->operands(),
1424 true /*IsSingleScalar*/);
1425 Clone->insertBefore(RepOrWidenR);
1426 RepOrWidenR->replaceAllUsesWith(Clone);
1427 if (isDeadRecipe(*RepOrWidenR))
1428 RepOrWidenR->eraseFromParent();
1429 }
1430 }
1431}
1432
1433/// Check if all of \p Blend's masks share a common logically and'ed value and,
1434/// if so, remove it from the masks.
1436 if (Blend->isNormalized())
1437 return;
1438 VPValue *CommonEdgeMask;
1439 if (!match(Blend->getMask(0),
1440 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1441 return;
1442 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1443 if (!match(Blend->getMask(I),
1444 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1445 return;
1446 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1447 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1448}
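// Illustrative example (approximate notation, not from the original file) of
// the common-mask stripping above. For a non-normalized blend whose masks all
// share the conjunct %m,
//
//   BLEND %r = %a/(%m && %c0), %b/(%m && %c1)
//     ==>
//   BLEND %r = %a/%c0, %b/%c1
//
// the shared conjunct is dropped from every mask.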
1449
1450/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1451/// to make sure the masks are simplified.
1452static void simplifyBlends(VPlan &Plan) {
1455 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1456 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1457 if (!Blend)
1458 continue;
1459
1460 removeCommonBlendMask(Blend);
1461
1462 // Try to remove redundant blend recipes.
1463 SmallPtrSet<VPValue *, 4> UniqueValues;
1464 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1465 UniqueValues.insert(Blend->getIncomingValue(0));
1466 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1467 if (!match(Blend->getMask(I), m_False()))
1468 UniqueValues.insert(Blend->getIncomingValue(I));
1469
1470 if (UniqueValues.size() == 1) {
1471 Blend->replaceAllUsesWith(*UniqueValues.begin());
1472 Blend->eraseFromParent();
1473 continue;
1474 }
1475
1476 if (Blend->isNormalized())
1477 continue;
1478
1479 // Normalize the blend so its first incoming value is used as the initial
1480 // value with the others blended into it.
1481
1482 unsigned StartIndex = 0;
1483 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1484 // If a value's mask is used only by the blend then it can be deadcoded.
1485 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1486 // that's used by multiple blends where it can be removed from them all.
1487 VPValue *Mask = Blend->getMask(I);
1488 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1489 StartIndex = I;
1490 break;
1491 }
1492 }
1493
1494 SmallVector<VPValue *, 4> OperandsWithMask;
1495 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1496
1497 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1498 if (I == StartIndex)
1499 continue;
1500 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1501 OperandsWithMask.push_back(Blend->getMask(I));
1502 }
1503
1504 auto *NewBlend =
1505 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1506 OperandsWithMask, Blend->getDebugLoc());
1507 NewBlend->insertBefore(&R);
1508
1509 VPValue *DeadMask = Blend->getMask(StartIndex);
1510 Blend->replaceAllUsesWith(NewBlend);
1511 Blend->eraseFromParent();
1513
1514 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1515 VPValue *NewMask;
1516 if (NewBlend->getNumOperands() == 3 &&
1517 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1518 VPValue *Inc0 = NewBlend->getOperand(0);
1519 VPValue *Inc1 = NewBlend->getOperand(1);
1520 VPValue *OldMask = NewBlend->getOperand(2);
1521 NewBlend->setOperand(0, Inc1);
1522 NewBlend->setOperand(1, Inc0);
1523 NewBlend->setOperand(2, NewMask);
1524 if (OldMask->getNumUsers() == 0)
1525 cast<VPInstruction>(OldMask)->eraseFromParent();
1526 }
1527 }
1528 }
1529}
1530
1531/// Optimize the width of vector induction variables in \p Plan based on a known
1532/// constant Trip Count, \p BestVF and \p BestUF.
1534 ElementCount BestVF,
1535 unsigned BestUF) {
1536 // Only proceed if we have not completely removed the vector region.
1537 if (!Plan.getVectorLoopRegion())
1538 return false;
1539
1540 const APInt *TC;
1541 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1542 return false;
1543
1544 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1545 // and UF. Returns at least 8.
1546 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1547 APInt AlignedTC =
1550 APInt MaxVal = AlignedTC - 1;
1551 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1552 };
1553 unsigned NewBitWidth =
1554 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1555
1556 LLVMContext &Ctx = Plan.getContext();
1557 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1558
1559 bool MadeChange = false;
1560
1561 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1562 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1563 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1564
1565 // Currently only handle canonical IVs as it is trivial to replace the start
1566 // and stop values, and we currently only perform the optimization when the
1567 // IV has a single use.
1568 if (!WideIV || !WideIV->isCanonical() ||
1569 WideIV->hasMoreThanOneUniqueUser() ||
1570 NewIVTy == WideIV->getScalarType())
1571 continue;
1572
1573 // Currently only handle cases where the single user is a header-mask
1574 // comparison with the backedge-taken-count.
1575 if (!match(*WideIV->user_begin(),
1576 m_ICmp(m_Specific(WideIV),
1579 continue;
1580
1581 // Update IV operands and comparison bound to use new narrower type.
1582 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1583 WideIV->setStartValue(NewStart);
1584 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1585 WideIV->setStepValue(NewStep);
1586
1587 auto *NewBTC = new VPWidenCastRecipe(
1588 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1589 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1590 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1591 Cmp->setOperand(1, NewBTC);
1592
1593 MadeChange = true;
1594 }
1595
1596 return MadeChange;
1597}
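// Illustrative worked example (not from the original file) for the width
// optimization above: with a known trip count of 100, BestVF = 4 and
// BestUF = 2, the trip count is rounded up to a multiple of VF * UF = 8
// (giving 104), the largest IV value is 103, which needs 7 active bits, and
// the width is rounded up to a power of two with a minimum of 8 -- so the
// canonical wide IV and the truncated backedge-taken count use i8.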
1598
1599/// Return true if \p Cond is known to be true for given \p BestVF and \p
1600/// BestUF.
1602 ElementCount BestVF, unsigned BestUF,
1603 ScalarEvolution &SE) {
1605 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1606 &SE](VPValue *C) {
1607 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1608 });
1609
1610 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1612 m_Specific(CanIV->getBackedgeValue()),
1613 m_Specific(&Plan.getVectorTripCount()))))
1614 return false;
1615
1616 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1617 // count is not conveniently available as SCEV so far, so we compare directly
1618 // against the original trip count. This is stricter than necessary, as we
1619 // will only return true if the trip count == vector trip count.
1620 const SCEV *VectorTripCount =
1622 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1623 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1624 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1625 "Trip count SCEV must be computable");
1626 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1627 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1628 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1629}
1630
1631/// Try to replace multiple active lane masks used for control flow with
1632/// a single, wide active lane mask instruction followed by multiple
1633/// extract subvector intrinsics. This applies to the active lane mask
1634/// instructions both in the loop and in the preheader.
1635/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1636/// new extracts from the first active lane mask, which has its last
1637/// operand (multiplier) set to UF.
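/// An illustrative sketch (assuming UF = 2; names are made up):
///   %alm.0 = active-lane-mask %index.0, %tc
///   %alm.1 = active-lane-mask %index.1, %tc
/// becomes
///   %wide.alm = active-lane-mask %index.0, %tc, multiplier=2
///   %alm.0 = vector.extract %wide.alm, 0
///   %alm.1 = vector.extract %wide.alm, VF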
1638static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1639                                       unsigned UF) {
1640 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1641 return false;
1642
1643 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1644 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1645 auto *Term = &ExitingVPBB->back();
1646
1647 using namespace llvm::VPlanPatternMatch;
1648  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1649                       m_VPValue(), m_VPValue(), m_VPValue())))))
1650 return false;
1651
1652 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1653 LLVMContext &Ctx = Plan.getContext();
1654
1655 auto ExtractFromALM = [&](VPInstruction *ALM,
1656 SmallVectorImpl<VPValue *> &Extracts) {
1657 DebugLoc DL = ALM->getDebugLoc();
1658    for (unsigned Part = 0; Part < UF; ++Part) {
1659      SmallVector<VPValue *> Ops;
1660      Ops.append({ALM, Plan.getOrAddLiveIn(
1661 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1662 VF.getKnownMinValue() * Part))});
1663      auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1664                                             IntegerType::getInt1Ty(Ctx), DL);
1665      Extracts[Part] = Ext;
1666 Ext->insertAfter(ALM);
1667 }
1668 };
1669
1670  // Create a list of each active lane mask phi, ordered by unroll part.
1671  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1672  for (VPRecipeBase &R : Header->phis()) {
1673    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1674    if (!Phi)
1675      continue;
1676 VPValue *Index = nullptr;
1677    match(Phi->getBackedgeValue(),
1678          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
1679    assert(Index && "Expected index from ActiveLaneMask instruction");
1680
1681 uint64_t Part;
1682    if (match(Index,
1683              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1684                  m_VPValue(), m_ConstantInt(Part))))
1685      Phis[Part] = Phi;
1686 else
1687 // Anything other than a CanonicalIVIncrementForPart is part 0
1688 Phis[0] = Phi;
1689 }
1690
1691 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1692 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1693
1694 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1695 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1696
1697 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1698 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1699 "Expected incoming values of Phi to be ActiveLaneMasks");
1700
1701 // When using wide lane masks, the return type of the get.active.lane.mask
1702 // intrinsic is VF x UF (last operand).
1703 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1704 EntryALM->setOperand(2, ALMMultiplier);
1705 LoopALM->setOperand(2, ALMMultiplier);
1706
1707 // Create UF x extract vectors and insert into preheader.
1708 SmallVector<VPValue *> EntryExtracts(UF);
1709 ExtractFromALM(EntryALM, EntryExtracts);
1710
1711 // Create UF x extract vectors and insert before the loop compare & branch,
1712 // updating the compare to use the first extract.
1713 SmallVector<VPValue *> LoopExtracts(UF);
1714 ExtractFromALM(LoopALM, LoopExtracts);
1715 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1716 Not->setOperand(0, LoopExtracts[0]);
1717
1718 // Update the incoming values of active lane mask phis.
1719 for (unsigned Part = 0; Part < UF; ++Part) {
1720 Phis[Part]->setStartValue(EntryExtracts[Part]);
1721 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1722 }
1723
1724 return true;
1725}
1726
1727/// Try to simplify the branch condition of \p Plan. This may restrict the
1728/// resulting plan to \p BestVF and \p BestUF.
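/// An illustrative example (not from the source): with a constant trip count
/// of 7, BestVF = 4 and BestUF = 2, TC <= VF * UF holds, so the vector loop
/// body executes exactly once and the latch terminator can be removed (or
/// replaced by BranchOnCond true if header phis keep the region alive).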
1729static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1730                                              unsigned BestUF,
1731                                              PredicatedScalarEvolution &PSE) {
1732  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1733 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1734 auto *Term = &ExitingVPBB->back();
1735 VPValue *Cond;
1736 ScalarEvolution &SE = *PSE.getSE();
1737  if (match(Term, m_BranchOnCount()) ||
1738      match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1739                      m_VPValue(), m_VPValue(), m_VPValue()))))) {
1740 // Try to simplify the branch condition if TC <= VF * UF when the latch
1741 // terminator is BranchOnCount or BranchOnCond where the input is
1742 // Not(ActiveLaneMask).
1743    const SCEV *TripCount =
1744        vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1745    assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1746 "Trip count SCEV must be computable");
1747 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1748 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1749 if (TripCount->isZero() ||
1750 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1751 return false;
1752 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1753 // For BranchOnCond, check if we can prove the condition to be true using VF
1754 // and UF.
1755 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1756 return false;
1757 } else {
1758 return false;
1759 }
1760
1761 // The vector loop region only executes once. If possible, completely remove
1762 // the region, otherwise replace the terminator controlling the latch with
1763 // (BranchOnCond true).
1764 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1765 // support for other non-canonical widen induction recipes (e.g.,
1766 // VPWidenPointerInductionRecipe).
1767 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1768 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1769 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1770 return R->isCanonical();
1771 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1772 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1773 })) {
1774 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1775 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1776 VPBuilder Builder(Plan.getVectorPreheader());
1777 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1778 R->getScalarType());
1779 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1780 HeaderR.eraseFromParent();
1781 continue;
1782 }
1783 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1784 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1785 HeaderR.eraseFromParent();
1786 }
1787
1788 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1789 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1790 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1791 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1792
1793 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1794 B->setParent(nullptr);
1795
1796 VPBlockUtils::connectBlocks(Preheader, Header);
1797 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1799 } else {
1800 // The vector region contains header phis for which we cannot remove the
1801 // loop region yet.
1802 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1803 Term->getDebugLoc());
1804 ExitingVPBB->appendRecipe(BOC);
1805 }
1806
1807 Term->eraseFromParent();
1808
1809 return true;
1810}
1811
1812void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1813                                         unsigned BestUF,
1814                                         PredicatedScalarEvolution &PSE) {
1815  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1816 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1817
1818 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1819 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1820 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1821
1822 if (MadeChange) {
1823 Plan.setVF(BestVF);
1824 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1825 }
1826 // TODO: Further simplifications are possible
1827 // 1. Replace inductions with constants.
1828 // 2. Replace vector loop region with VPBasicBlock.
1829}
1830
1831/// Sink users of \p FOR after the recipe defining the previous value \p
1832/// Previous of the recurrence. \returns true if all users of \p FOR could be
1833/// re-arranged as needed or false if it is not possible.
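/// An illustrative example (names are made up): for
///   %for = phi [ %init, %ph ], [ %prev, %loop ]
///   %use = add %for, 1
///   %prev = load %ptr
/// the recipe defining %use is sunk after the load defining %prev, so the
/// recurrence splice inserted later has both values available.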
1834static bool
1835sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1836                                 VPRecipeBase *Previous,
1837 VPDominatorTree &VPDT) {
1838  // Collect recipes that need sinking.
1839  SmallVector<VPRecipeBase *> WorkList;
1840  SmallPtrSet<VPRecipeBase *, 8> Seen;
1841  Seen.insert(Previous);
1842 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1843 // The previous value must not depend on the users of the recurrence phi. In
1844 // that case, FOR is not a fixed order recurrence.
1845 if (SinkCandidate == Previous)
1846 return false;
1847
1848 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1849 !Seen.insert(SinkCandidate).second ||
1850 VPDT.properlyDominates(Previous, SinkCandidate))
1851 return true;
1852
1853 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1854 return false;
1855
1856 WorkList.push_back(SinkCandidate);
1857 return true;
1858 };
1859
1860 // Recursively sink users of FOR after Previous.
1861 WorkList.push_back(FOR);
1862 for (unsigned I = 0; I != WorkList.size(); ++I) {
1863 VPRecipeBase *Current = WorkList[I];
1864 assert(Current->getNumDefinedValues() == 1 &&
1865 "only recipes with a single defined value expected");
1866
1867 for (VPUser *User : Current->getVPSingleValue()->users()) {
1868 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1869 return false;
1870 }
1871 }
1872
1873 // Keep recipes to sink ordered by dominance so earlier instructions are
1874 // processed first.
1875 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1876 return VPDT.properlyDominates(A, B);
1877 });
1878
1879 for (VPRecipeBase *SinkCandidate : WorkList) {
1880 if (SinkCandidate == FOR)
1881 continue;
1882
1883 SinkCandidate->moveAfter(Previous);
1884 Previous = SinkCandidate;
1885 }
1886 return true;
1887}
1888
1889/// Try to hoist \p Previous and its operands before all users of \p FOR.
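/// An illustrative example (names are made up): if \p Previous is a load
/// whose address is computed before the first user of \p FOR, the load (and
/// transitively its address computation) is hoisted before that user instead
/// of sinking all of FOR's users.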
1890static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1891                                        VPRecipeBase *Previous,
1892 VPDominatorTree &VPDT) {
1893 if (cannotHoistOrSinkRecipe(*Previous))
1894 return false;
1895
1896 // Collect recipes that need hoisting.
1897  SmallVector<VPRecipeBase *> HoistCandidates;
1898  SmallPtrSet<VPRecipeBase *, 8> Visited;
1899  VPRecipeBase *HoistPoint = nullptr;
1900 // Find the closest hoist point by looking at all users of FOR and selecting
1901 // the recipe dominating all other users.
1902 for (VPUser *U : FOR->users()) {
1903 auto *R = cast<VPRecipeBase>(U);
1904 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1905 HoistPoint = R;
1906 }
1907 assert(all_of(FOR->users(),
1908 [&VPDT, HoistPoint](VPUser *U) {
1909 auto *R = cast<VPRecipeBase>(U);
1910 return HoistPoint == R ||
1911 VPDT.properlyDominates(HoistPoint, R);
1912 }) &&
1913 "HoistPoint must dominate all users of FOR");
1914
1915 auto NeedsHoisting = [HoistPoint, &VPDT,
1916 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1917 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1918 if (!HoistCandidate)
1919 return nullptr;
1920 VPRegionBlock *EnclosingLoopRegion =
1921 HoistCandidate->getParent()->getEnclosingLoopRegion();
1922 assert((!HoistCandidate->getRegion() ||
1923 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
1924 "CFG in VPlan should still be flat, without replicate regions");
1925 // Hoist candidate was already visited, no need to hoist.
1926 if (!Visited.insert(HoistCandidate).second)
1927 return nullptr;
1928
1929    // The candidate is outside the loop region or is a header phi; it already
1930    // dominates the users of FOR without hoisting.
1931 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1932 return nullptr;
1933
1934 // If we reached a recipe that dominates HoistPoint, we don't need to
1935 // hoist the recipe.
1936 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1937 return nullptr;
1938 return HoistCandidate;
1939 };
1940
1941 if (!NeedsHoisting(Previous->getVPSingleValue()))
1942 return true;
1943
1944 // Recursively try to hoist Previous and its operands before all users of FOR.
1945 HoistCandidates.push_back(Previous);
1946
1947 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1948 VPRecipeBase *Current = HoistCandidates[I];
1949 assert(Current->getNumDefinedValues() == 1 &&
1950 "only recipes with a single defined value expected");
1951 if (cannotHoistOrSinkRecipe(*Current))
1952 return false;
1953
1954 for (VPValue *Op : Current->operands()) {
1955 // If we reach FOR, it means the original Previous depends on some other
1956 // recurrence that in turn depends on FOR. If that is the case, we would
1957 // also need to hoist recipes involving the other FOR, which may break
1958 // dependencies.
1959 if (Op == FOR)
1960 return false;
1961
1962 if (auto *R = NeedsHoisting(Op))
1963 HoistCandidates.push_back(R);
1964 }
1965 }
1966
1967 // Order recipes to hoist by dominance so earlier instructions are processed
1968 // first.
1969 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1970 return VPDT.properlyDominates(A, B);
1971 });
1972
1973 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1974 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1975 HoistPoint->getIterator());
1976 }
1977
1978 return true;
1979}
1980
1981bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1982                                                  VPBuilder &LoopBuilder) {
1983 VPDominatorTree VPDT(Plan);
1984
1985  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1986  for (VPRecipeBase &R :
1987       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())
1988    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1989      RecurrencePhis.push_back(FOR);
1990
1991  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1992    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1993    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1994 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1995 // to terminate.
1996    while (auto *PrevPhi =
1997               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1998      assert(PrevPhi->getParent() == FOR->getParent());
1999 assert(SeenPhis.insert(PrevPhi).second);
2000 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2001 }
2002
2003 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2004 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2005 return false;
2006
2007 // Introduce a recipe to combine the incoming and previous values of a
2008 // fixed-order recurrence.
2009 VPBasicBlock *InsertBlock = Previous->getParent();
2010 if (isa<VPHeaderPHIRecipe>(Previous))
2011 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2012 else
2013 LoopBuilder.setInsertPoint(InsertBlock,
2014 std::next(Previous->getIterator()));
2015
2016    auto *RecurSplice =
2017        LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
2018                                 {FOR, FOR->getBackedgeValue()});
2019
2020 FOR->replaceAllUsesWith(RecurSplice);
2021 // Set the first operand of RecurSplice to FOR again, after replacing
2022 // all users.
2023 RecurSplice->setOperand(0, FOR);
2024 }
2025 return true;
2026}
2027
2029  for (VPRecipeBase &R :
2030       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2031    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2032 if (!PhiR)
2033 continue;
2034 RecurKind RK = PhiR->getRecurrenceKind();
2035 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2037 continue;
2038
2039 for (VPUser *U : collectUsersRecursively(PhiR))
2040 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2041 RecWithFlags->dropPoisonGeneratingFlags();
2042 }
2043 }
2044}
2045
2046namespace {
2047struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2048 static bool isSentinel(const VPSingleDefRecipe *Def) {
2049 return Def == getEmptyKey() || Def == getTombstoneKey();
2050 }
2051
2052 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2053 /// return that source element type.
2054 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2055 // All VPInstructions that lower to GEPs must have the i8 source element
2056    // type (as they are PtrAdds), so we omit it.
2057    return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
2058        .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2059 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2060 return GEP->getSourceElementType();
2061 return nullptr;
2062 })
2063 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2064 [](auto *I) { return I->getSourceElementType(); })
2065 .Default([](auto *) { return nullptr; });
2066 }
2067
2068  /// Returns true if recipe \p Def can be safely handled for CSE.
2069 static bool canHandle(const VPSingleDefRecipe *Def) {
2070 // We can extend the list of handled recipes in the future,
2071 // provided we account for the data embedded in them while checking for
2072 // equality or hashing.
2073 auto C = getOpcodeOrIntrinsicID(Def);
2074
2075 // The issue with (Insert|Extract)Value is that the index of the
2076 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2077 // VPlan.
2078 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2079 C->second == Instruction::ExtractValue)))
2080 return false;
2081
2082 // During CSE, we can only handle recipes that don't read from memory: if
2083 // they read from memory, there could be an intervening write to memory
2084 // before the next instance is CSE'd, leading to an incorrect result.
2085 return !Def->mayReadFromMemory();
2086 }
2087
2088 /// Hash the underlying data of \p Def.
2089 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2090 const VPlan *Plan = Def->getParent()->getPlan();
2091 VPTypeAnalysis TypeInfo(*Plan);
2092 hash_code Result = hash_combine(
2093 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2094        getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2095        hash_combine_range(Def->op_begin(), Def->op_end()));
2096    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2097 if (RFlags->hasPredicate())
2098 return hash_combine(Result, RFlags->getPredicate());
2099 return Result;
2100 }
2101
2102 /// Check equality of underlying data of \p L and \p R.
2103 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2104 if (isSentinel(L) || isSentinel(R))
2105 return L == R;
2106    if (L->getVPDefID() != R->getVPDefID() ||
2107        getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2108        getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2110        !equal(L->operands(), R->operands()))
2111      return false;
2112    assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
2113           "must have valid opcode info for both recipes");
2114 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2115 if (LFlags->hasPredicate() &&
2116 LFlags->getPredicate() !=
2117 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2118 return false;
2119 // Recipes in replicate regions implicitly depend on predicate. If either
2120 // recipe is in a replicate region, only consider them equal if both have
2121 // the same parent.
2122 const VPRegionBlock *RegionL = L->getRegion();
2123 const VPRegionBlock *RegionR = R->getRegion();
2124 if (((RegionL && RegionL->isReplicator()) ||
2125 (RegionR && RegionR->isReplicator())) &&
2126 L->getParent() != R->getParent())
2127 return false;
2128 const VPlan *Plan = L->getParent()->getPlan();
2129 VPTypeAnalysis TypeInfo(*Plan);
2130 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2131 }
2132};
2133} // end anonymous namespace
2134
2135/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2136/// Plan.
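/// An illustrative example (names are made up): given two identical recipes
///   %a = WIDEN add %x, %y
///   %b = WIDEN add %x, %y
/// where the first dominates the second, users of %b are rewired to %a and
/// the IR flags of %a are intersected with those of %b.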
2138  VPDominatorTree VPDT(Plan);
2139  DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2140
2141  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2142           vp_depth_first_deep(Plan.getEntry()))) {
2143 for (VPRecipeBase &R : *VPBB) {
2144 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2145 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2146 continue;
2147 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2148 // V must dominate Def for a valid replacement.
2149 if (!VPDT.dominates(V->getParent(), VPBB))
2150 continue;
2151 // Only keep flags present on both V and Def.
2152 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2153 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2154 Def->replaceAllUsesWith(V);
2155 continue;
2156 }
2157 CSEMap[Def] = Def;
2158 }
2159 }
2160}
2161
2162/// Move loop-invariant recipes out of the vector loop region in \p Plan.
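/// An illustrative example (names are made up): a recipe %inv = mul %c1, %c2
/// inside the loop region whose operands are both defined outside the region
/// is moved to the vector preheader, so it is computed only once.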
2163static void licm(VPlan &Plan) {
2164 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2165
2166 // Hoist any loop invariant recipes from the vector loop region to the
2167  // preheader. Perform a shallow traversal of the vector loop region, to
2168 // exclude recipes in replicate regions. Since the top-level blocks in the
2169 // vector loop region are guaranteed to execute if the vector pre-header is,
2170 // we don't need to check speculation safety.
2171 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2172  assert(Preheader->getSingleSuccessor() == LoopRegion &&
2173         "Expected vector preheader's successor to be the vector loop region");
2174  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2175           vp_depth_first_shallow(LoopRegion->getEntry()))) {
2176 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2178 continue;
2179 if (any_of(R.operands(), [](VPValue *Op) {
2180 return !Op->isDefinedOutsideLoopRegions();
2181 }))
2182 continue;
2183 R.moveBefore(*Preheader, Preheader->end());
2184 }
2185 }
2186}
2187
2189 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2190 if (Plan.hasScalarVFOnly())
2191 return;
2192 // Keep track of created truncates, so they can be re-used. Note that we
2193  // cannot use RAUW after creating a new truncate, as this could make
2194 // other uses have different types for their operands, making them invalidly
2195 // typed.
2196  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2197  VPTypeAnalysis TypeInfo(Plan);
2198  VPBasicBlock *PH = Plan.getVectorPreheader();
2199  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2200           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
2201    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2204 &R))
2205 continue;
2206
2207 VPValue *ResultVPV = R.getVPSingleValue();
2208 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2209 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2210 if (!NewResSizeInBits)
2211 continue;
2212
2213 // If the value wasn't vectorized, we must maintain the original scalar
2214 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2215 // skip casts which do not need to be handled explicitly here, as
2216 // redundant casts will be removed during recipe simplification.
2218 continue;
2219
2220 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2221 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2222 assert(OldResTy->isIntegerTy() && "only integer types supported");
2223 (void)OldResSizeInBits;
2224
2225 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2226
2227 // Any wrapping introduced by shrinking this operation shouldn't be
2228 // considered undefined behavior. So, we can't unconditionally copy
2229 // arithmetic wrapping flags to VPW.
2230 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2231 VPW->dropPoisonGeneratingFlags();
2232
2233 if (OldResSizeInBits != NewResSizeInBits &&
2234 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2235 // Extend result to original width.
2236 auto *Ext =
2237 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2238 Ext->insertAfter(&R);
2239 ResultVPV->replaceAllUsesWith(Ext);
2240 Ext->setOperand(0, ResultVPV);
2241 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2242 } else {
2243 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2244 "Only ICmps should not need extending the result.");
2245 }
2246
2247 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2249 continue;
2250
2251 // Shrink operands by introducing truncates as needed.
2252 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2253 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2254 auto *Op = R.getOperand(Idx);
2255        unsigned OpSizeInBits =
2256            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
2257        if (OpSizeInBits == NewResSizeInBits)
2258 continue;
2259 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2260 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2261 if (!IterIsEmpty) {
2262 R.setOperand(Idx, ProcessedIter->second);
2263 continue;
2264 }
2265
2266 VPBuilder Builder;
2267 if (Op->isLiveIn())
2268 Builder.setInsertPoint(PH);
2269 else
2270 Builder.setInsertPoint(&R);
2271 VPWidenCastRecipe *NewOp =
2272 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2273 ProcessedIter->second = NewOp;
2274 R.setOperand(Idx, NewOp);
2275 }
2276
2277 }
2278 }
2279}
2280
2281void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2282  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2283           vp_depth_first_shallow(Plan.getEntry()))) {
2284    VPValue *Cond;
2285 // Skip blocks that are not terminated by BranchOnCond.
2286 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2287 continue;
2288
2289 assert(VPBB->getNumSuccessors() == 2 &&
2290 "Two successors expected for BranchOnCond");
2291 unsigned RemovedIdx;
2292 if (match(Cond, m_True()))
2293 RemovedIdx = 1;
2294 else if (match(Cond, m_False()))
2295 RemovedIdx = 0;
2296 else
2297 continue;
2298
2299 VPBasicBlock *RemovedSucc =
2300 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2301 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2302 "There must be a single edge between VPBB and its successor");
2303    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2304 // these recipes.
2305 for (VPRecipeBase &R : RemovedSucc->phis())
2306 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2307
2308 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2309 // automatically on VPlan destruction if it becomes unreachable.
2310 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2311 VPBB->back().eraseFromParent();
2312 }
2313}
2314
2333
2334// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2335// the loop terminator with a branch-on-cond recipe with the negated
2336// active-lane-mask as operand. Note that this turns the loop into an
2337// uncountable one. Only the existing terminator is replaced, all other existing
2338// uncountable one. Only the existing terminator is replaced; all other existing
2339// dropped from the canonical IV increment. Return the created
2340// VPActiveLaneMaskPHIRecipe.
2341//
2342// The function uses the following definitions:
2343//
2344// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2345// calculate-trip-count-minus-VF (original TC) : original TC
2346// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2347// CanonicalIVPhi : CanonicalIVIncrement
2348// %StartV is the canonical induction start value.
2349//
2350// The function adds the following recipes:
2351//
2352// vector.ph:
2353// %TripCount = calculate-trip-count-minus-VF (original TC)
2354// [if DataWithControlFlowWithoutRuntimeCheck]
2355// %EntryInc = canonical-iv-increment-for-part %StartV
2356// %EntryALM = active-lane-mask %EntryInc, %TripCount
2357//
2358// vector.body:
2359// ...
2360// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2361// ...
2362// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2363// %ALM = active-lane-mask %InLoopInc, TripCount
2364// %Negated = Not %ALM
2365// branch-on-cond %Negated
2366//
2367static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2368    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
2369  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2370 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2371 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2372 VPValue *StartV = CanonicalIVPHI->getStartValue();
2373
2374 auto *CanonicalIVIncrement =
2375 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2376 // TODO: Check if dropping the flags is needed if
2377 // !DataAndControlFlowWithoutRuntimeCheck.
2378 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2379 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2380 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2381 // we have to take unrolling into account. Each part needs to start at
2382 // Part * VF
2383 auto *VecPreheader = Plan.getVectorPreheader();
2384 VPBuilder Builder(VecPreheader);
2385
2386 // Create the ActiveLaneMask instruction using the correct start values.
2387 VPValue *TC = Plan.getTripCount();
2388
2389  VPValue *TripCount, *IncrementValue;
2390  if (!DataAndControlFlowWithoutRuntimeCheck) {
2391    // When the loop is guarded by a runtime overflow check for the loop
2392 // induction variable increment by VF, we can increment the value before
2393 // the get.active.lane mask and use the unmodified tripcount.
2394 IncrementValue = CanonicalIVIncrement;
2395 TripCount = TC;
2396 } else {
2397 // When avoiding a runtime check, the active.lane.mask inside the loop
2398 // uses a modified trip count and the induction variable increment is
2399 // done after the active.lane.mask intrinsic is called.
2400 IncrementValue = CanonicalIVPHI;
2401 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2402 {TC}, DL);
2403 }
2404 auto *EntryIncrement = Builder.createOverflowingOp(
2405 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2406 "index.part.next");
2407
2408 // Create the active lane mask instruction in the VPlan preheader.
2409 VPValue *ALMMultiplier =
2410 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2411 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2412 {EntryIncrement, TC, ALMMultiplier}, DL,
2413 "active.lane.mask.entry");
2414
2415 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2416 // preheader ActiveLaneMask instruction.
2417 auto *LaneMaskPhi =
2419 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2420
2421 // Create the active lane mask for the next iteration of the loop before the
2422 // original terminator.
2423 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2424 Builder.setInsertPoint(OriginalTerminator);
2425 auto *InLoopIncrement =
2426 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2427 {IncrementValue}, {false, false}, DL);
2428 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2429 {InLoopIncrement, TripCount, ALMMultiplier},
2430 DL, "active.lane.mask.next");
2431 LaneMaskPhi->addOperand(ALM);
2432
2433 // Replace the original terminator with BranchOnCond. We have to invert the
2434 // mask here because a true condition means jumping to the exit block.
2435 auto *NotMask = Builder.createNot(ALM, DL);
2436 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2437 OriginalTerminator->eraseFromParent();
2438 return LaneMaskPhi;
2439}
2440
2441/// Collect the header mask with the pattern:
2442/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2443/// TODO: Introduce explicit recipe for header-mask instead of searching
2444/// for the header-mask pattern manually.
2445static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2446  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2447  SmallVector<VPValue *> WideCanonicalIVs;
2448  auto *FoundWidenCanonicalIVUser = find_if(
2449      LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
2450  assert(count_if(LoopRegion->getCanonicalIV()->users(),
2451                  IsaPred<VPWidenCanonicalIVRecipe>) <= 1 &&
2452         "Must have at most one VPWidenCanonicalIVRecipe");
2453 if (FoundWidenCanonicalIVUser !=
2454 LoopRegion->getCanonicalIV()->users().end()) {
2455 auto *WideCanonicalIV =
2456 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2457 WideCanonicalIVs.push_back(WideCanonicalIV);
2458 }
2459
2460 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2461 // version of the canonical induction.
2462 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2463 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2464 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2465 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2466 WideCanonicalIVs.push_back(WidenOriginalIV);
2467 }
2468
2469 // Walk users of wide canonical IVs and find the single compare of the form
2470 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2471 VPSingleDefRecipe *HeaderMask = nullptr;
2472 for (auto *Wide : WideCanonicalIVs) {
2473 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2474 auto *VPI = dyn_cast<VPInstruction>(U);
2475 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2476 continue;
2477
2478 assert(VPI->getOperand(0) == Wide &&
2479 "WidenCanonicalIV must be the first operand of the compare");
2480 assert(!HeaderMask && "Multiple header masks found?");
2481 HeaderMask = VPI;
2482 }
2483 }
2484 return HeaderMask;
2485}
2486
2487void VPlanTransforms::addActiveLaneMask(
2488    VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2489    bool DataAndControlFlowWithoutRuntimeCheck) {
2490  assert((!DataAndControlFlowWithoutRuntimeCheck ||
2491          UseActiveLaneMaskForControlFlow) &&
2492         "DataAndControlFlowWithoutRuntimeCheck implies "
2493         "UseActiveLaneMaskForControlFlow");
2494
2495 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2496  auto *FoundWidenCanonicalIVUser = find_if(
2497      LoopRegion->getCanonicalIV()->users(), IsaPred<VPWidenCanonicalIVRecipe>);
2498  assert(FoundWidenCanonicalIVUser &&
2499 "Must have widened canonical IV when tail folding!");
2500 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2501 auto *WideCanonicalIV =
2502 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2503 VPSingleDefRecipe *LaneMask;
2504  if (UseActiveLaneMaskForControlFlow) {
2505    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2506        Plan, DataAndControlFlowWithoutRuntimeCheck);
2507  } else {
2508 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2509 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2510 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2511 LaneMask =
2512 B.createNaryOp(VPInstruction::ActiveLaneMask,
2513 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2514 nullptr, "active.lane.mask");
2515 }
2516
2517 // Walk users of WideCanonicalIV and replace the header mask of the form
2518 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2519 // removing the old one to ensure there is always only a single header mask.
2520 HeaderMask->replaceAllUsesWith(LaneMask);
2521 HeaderMask->eraseFromParent();
2522}
2523
2524template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2525  Op0_t In;
2526  Op1_t &Out;
2527
2528 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2529
2530 template <typename OpTy> bool match(OpTy *V) const {
2531 if (m_Specific(In).match(V)) {
2532 Out = nullptr;
2533 return true;
2534    }
2535    if (match(V, m_LogicalAnd(m_Specific(In), m_VPValue(Out))))
2536      return true;
2537    return false;
2538 }
2539};
2540
2541/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2542/// Returns the remaining part \p Out if so, or nullptr otherwise.
2543template <typename Op0_t, typename Op1_t>
2544static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2545 Op1_t &Out) {
2546 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2547}
2548
2549/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2550/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2551/// recipe could be created.
2552/// \p HeaderMask Header Mask.
2553/// \p CurRecipe Recipe to be transformed.
2554/// \p TypeInfo VPlan-based type analysis.
2555/// \p EVL The explicit vector length parameter of vector-predication
2556/// intrinsics.
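/// An illustrative sketch (not from the source): a widened load masked only
/// by the header mask becomes an EVL-predicated load,
///   WIDEN load %addr, mask: %header.mask -> VP load %addr, EVL: %evl
/// and a select on the header mask becomes a vp.merge with %evl.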
2558 VPRecipeBase &CurRecipe,
2559 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2560 VPlan *Plan = CurRecipe.getParent()->getPlan();
2561 VPValue *Addr, *Mask, *EndPtr;
2562
2563  // Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2564 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2565 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2566 EVLEndPtr->insertBefore(&CurRecipe);
2567 EVLEndPtr->setOperand(1, &EVL);
2568 return EVLEndPtr;
2569 };
2570
2571 if (match(&CurRecipe,
2572 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2573 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2574 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2575 EVL, Mask);
2576
2577 if (match(&CurRecipe,
2578 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2579 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2580 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2581 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2582 AdjustEndPtr(EndPtr), EVL, Mask);
2583
2584 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2585 m_RemoveMask(HeaderMask, Mask))) &&
2586 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2587 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2588 EVL, Mask);
2589
2590 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2591 m_RemoveMask(HeaderMask, Mask))) &&
2592 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2593 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2594 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2595 AdjustEndPtr(EndPtr), EVL, Mask);
2596
2597 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2598 if (Rdx->isConditional() &&
2599 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2600 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2601
2602 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2603 if (Interleave->getMask() &&
2604 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2605 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2606
2607 VPValue *LHS, *RHS;
2608 if (match(&CurRecipe,
2609 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2610 return new VPWidenIntrinsicRecipe(
2611 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2612 TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
2613
2614 return nullptr;
2615}
2616
2617/// Replace recipes with their EVL variants.
2619 VPTypeAnalysis TypeInfo(Plan);
2620 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2621 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2622
2623 assert(all_of(Plan.getVF().users(),
2626 "User of VF that we can't transform to EVL.");
2627 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2629 });
2630
2631 assert(all_of(Plan.getVFxUF().users(),
2632 [&LoopRegion, &Plan](VPUser *U) {
2633 return match(U,
2634 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2635 m_Specific(&Plan.getVFxUF()))) ||
2636 isa<VPWidenPointerInductionRecipe>(U);
2637 }) &&
2638 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2639 "increment of the canonical induction.");
2640 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2641 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2642    // canonical induction must not be updated.
2643    return isa<VPWidenPointerInductionRecipe>(U);
2644  });
2645
2646 // Defer erasing recipes till the end so that we don't invalidate the
2647  // VPTypeAnalysis cache.
2648  SmallVector<VPRecipeBase *> ToErase;
2649
2650 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
2651 // contained.
2652  bool ContainsFORs =
2653      any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2654  if (ContainsFORs) {
2655 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2656 VPValue *MaxEVL = &Plan.getVF();
2657    // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2658 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2659 MaxEVL = Builder.createScalarZExtOrTrunc(
2660 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2661 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2662
2663 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2664 VPValue *PrevEVL = Builder.createScalarPhi(
2665 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2666
2669 for (VPRecipeBase &R : *VPBB) {
2670 VPValue *V1, *V2;
2671        if (!match(&R,
2672                   m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2673                       m_VPValue(V1), m_VPValue(V2))))
2674 continue;
2675        VPValue *Imm = Plan.getOrAddLiveIn(
2676            ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
2677        auto *VPSplice = new VPWidenIntrinsicRecipe(
2678            Intrinsic::experimental_vp_splice,
2679 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2680 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2681 VPSplice->insertBefore(&R);
2682 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2683 ToErase.push_back(&R);
2684 }
2685 }
2686 }
2687
2688 VPValue *HeaderMask = findHeaderMask(Plan);
2689 if (!HeaderMask)
2690 return;
2691
2692 // Replace header masks with a mask equivalent to predicating by EVL:
2693 //
2694 // icmp ule widen-canonical-iv backedge-taken-count
2695 // ->
2696 // icmp ult step-vector, EVL
2697 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2698 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2699 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2700  VPValue *EVLMask = Builder.createICmp(
2701      CmpInst::ICMP_ULT,
2702      Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2703 HeaderMask->replaceAllUsesWith(EVLMask);
2704 ToErase.push_back(HeaderMask->getDefiningRecipe());
2705
2706 // Try to optimize header mask recipes away to their EVL variants.
2707 // TODO: Split optimizeMaskToEVL out and move into
2708 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2709 // tryToBuildVPlanWithVPRecipes beforehand.
2710 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2711 auto *CurRecipe = cast<VPRecipeBase>(U);
2712 VPRecipeBase *EVLRecipe =
2713 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2714 if (!EVLRecipe)
2715 continue;
2716
2717 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2718 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2719 "New recipe must define the same number of values as the "
2720 "original.");
2721 EVLRecipe->insertBefore(CurRecipe);
2723 EVLRecipe)) {
2724 for (unsigned I = 0; I < NumDefVal; ++I) {
2725 VPValue *CurVPV = CurRecipe->getVPValue(I);
2726 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2727 }
2728 }
2729 ToErase.push_back(CurRecipe);
2730 }
2731 // Remove dead EVL mask.
2732 if (EVLMask->getNumUsers() == 0)
2733 ToErase.push_back(EVLMask->getDefiningRecipe());
2734
2735 for (VPRecipeBase *R : reverse(ToErase)) {
2736 SmallVector<VPValue *> PossiblyDead(R->operands());
2737 R->eraseFromParent();
2738    for (VPValue *Op : PossiblyDead)
2739      recursivelyDeleteDeadRecipes(Op);
2740  }
2741}
2742
2743/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2744/// replaces all uses of the VPCanonicalIVPHIRecipe, except the canonical IV
2745/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation, the
2746/// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
2747///
2748/// The function uses the following definitions:
2749/// %StartV is the canonical induction start value.
2750///
2751/// The function adds the following recipes:
2752///
2753/// vector.ph:
2754/// ...
2755///
2756/// vector.body:
2757/// ...
2758/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2759/// [ %NextEVLIV, %vector.body ]
2760/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2761/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2762/// ...
2763/// %OpEVL = cast i32 %VPEVL to IVSize
2764/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2765/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2766/// ...
2767///
2768/// If MaxSafeElements is provided, the function adds the following recipes:
2769/// vector.ph:
2770/// ...
2771///
2772/// vector.body:
2773/// ...
2774/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2775/// [ %NextEVLIV, %vector.body ]
2776/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2777/// %cmp = cmp ult %AVL, MaxSafeElements
2778/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2779/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2780/// ...
2781/// %OpEVL = cast i32 %VPEVL to IVSize
2782/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2783/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2784/// ...
2785///
2786void VPlanTransforms::addExplicitVectorLength(
2787    VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2788 if (Plan.hasScalarVFOnly())
2789 return;
2790 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2791 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2792
2793 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2794 auto *CanIVTy = LoopRegion->getCanonicalIVType();
2795 VPValue *StartV = CanonicalIVPHI->getStartValue();
2796
2797 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2798 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2799 EVLPhi->insertAfter(CanonicalIVPHI);
2800 VPBuilder Builder(Header, Header->getFirstNonPhi());
2801 // Create the AVL (application vector length), starting from TC -> 0 in steps
2802 // of EVL.
2803 VPPhi *AVLPhi = Builder.createScalarPhi(
2804 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2805 VPValue *AVL = AVLPhi;
2806
2807 if (MaxSafeElements) {
2808 // Support for MaxSafeDist for correct loop emission.
2809 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
2810 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2811 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2812 "safe_avl");
2813 }
2814 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2816
2817 auto *CanonicalIVIncrement =
2818 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2819 Builder.setInsertPoint(CanonicalIVIncrement);
2820 VPValue *OpVPEVL = VPEVL;
2821
2822 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2823 OpVPEVL = Builder.createScalarZExtOrTrunc(
2824 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2825
2826 auto *NextEVLIV = Builder.createOverflowingOp(
2827 Instruction::Add, {OpVPEVL, EVLPhi},
2828 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2829 CanonicalIVIncrement->hasNoSignedWrap()},
2830 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2831 EVLPhi->addOperand(NextEVLIV);
2832
2833 VPValue *NextAVL = Builder.createOverflowingOp(
2834 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2835 DebugLoc::getCompilerGenerated(), "avl.next");
2836 AVLPhi->addOperand(NextAVL);
2837
2838 transformRecipestoEVLRecipes(Plan, *VPEVL);
2839
2840 // Replace all uses of VPCanonicalIVPHIRecipe by
2841 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2842 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2843 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2844 // TODO: support unroll factor > 1.
2845 Plan.setUF(1);
2846}
2847
2849 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2850 // There should be only one EVL PHI in the entire plan.
2851 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2852
2855 for (VPRecipeBase &R : VPBB->phis())
2856 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2857 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2858 EVLPhi = PhiR;
2859 }
2860
2861 // Early return if no EVL PHI is found.
2862 if (!EVLPhi)
2863 return;
2864
2865 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2866 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2867 VPValue *AVL;
2868 [[maybe_unused]] bool FoundAVL =
2869 match(EVLIncrement,
2870 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2871 assert(FoundAVL && "Didn't find AVL?");
2872
2873 // The AVL may be capped to a safe distance.
2874 VPValue *SafeAVL;
2875 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2876 AVL = SafeAVL;
2877
2878 VPValue *AVLNext;
2879 [[maybe_unused]] bool FoundAVLNext =
2881 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2882 assert(FoundAVLNext && "Didn't find AVL backedge?");
2883
2884 // Convert EVLPhi to concrete recipe.
2885 auto *ScalarR =
2886 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2887 EVLPhi->getDebugLoc(), "evl.based.iv");
2888 EVLPhi->replaceAllUsesWith(ScalarR);
2889 EVLPhi->eraseFromParent();
2890
2891 // Replace CanonicalIVInc with EVL-PHI increment.
2892 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2893 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2894 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2895 m_Specific(&Plan.getVFxUF()))) &&
2896 "Unexpected canonical iv");
2897 Backedge->replaceAllUsesWith(EVLIncrement);
2898
2899 // Remove unused phi and increment.
2900 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2901 CanonicalIVIncrement->eraseFromParent();
2902 CanonicalIV->eraseFromParent();
2903
2904 // Replace the use of VectorTripCount in the latch-exiting block.
2905 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2906 // After: (branch-on-cond eq AVLNext, 0)
2907
2908 VPBasicBlock *LatchExiting =
2909 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2910 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2911 // Skip single-iteration loop region
2912 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2913 return;
2914 assert(LatchExitingBr &&
2915 match(LatchExitingBr,
2916 m_BranchOnCount(m_VPValue(EVLIncrement),
2917 m_Specific(&Plan.getVectorTripCount()))) &&
2918 "Unexpected terminator in EVL loop");
2919
2920 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2921 VPBuilder Builder(LatchExitingBr);
2922 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2923 Plan.getConstantInt(AVLTy, 0));
2924 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2925 LatchExitingBr->eraseFromParent();
2926}
2927
2929 VPlan &Plan, PredicatedScalarEvolution &PSE,
2930 const DenseMap<Value *, const SCEV *> &StridesMap) {
2931  // Replace VPValues for known constant strides guaranteed by predicated
2932  // scalar evolution.
2933 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2934 auto *R = cast<VPRecipeBase>(&U);
2935 return R->getRegion() ||
2936 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2937 };
2938 ValueToSCEVMapTy RewriteMap;
2939 for (const SCEV *Stride : StridesMap.values()) {
2940 using namespace SCEVPatternMatch;
2941 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2942 const APInt *StrideConst;
2943 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2944 // Only handle constant strides for now.
2945 continue;
2946
2947 auto *CI = Plan.getConstantInt(*StrideConst);
2948 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2949 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2950
2951 // The versioned value may not be used in the loop directly but through a
2952 // sext/zext. Add new live-ins in those cases.
2953    for (Value *U : StrideV->users()) {
2954      if (!isa<SExtInst, ZExtInst>(U))
2955        continue;
2956 VPValue *StrideVPV = Plan.getLiveIn(U);
2957 if (!StrideVPV)
2958 continue;
2959 unsigned BW = U->getType()->getScalarSizeInBits();
2960 APInt C =
2961 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2962 VPValue *CI = Plan.getConstantInt(C);
2963 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2964 }
2965 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2966 }
2967
2968 for (VPRecipeBase &R : *Plan.getEntry()) {
2969 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2970 if (!ExpSCEV)
2971 continue;
2972 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2973 auto *NewSCEV =
2974 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2975 if (NewSCEV != ScevExpr) {
2976 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2977 ExpSCEV->replaceAllUsesWith(NewExp);
2978 if (Plan.getTripCount() == ExpSCEV)
2979 Plan.resetTripCount(NewExp);
2980 }
2981 }
2982}
2983
2984void VPlanTransforms::dropPoisonGeneratingRecipes(
2985    VPlan &Plan,
2986 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2987 // Collect recipes in the backward slice of `Root` that may generate a poison
2988 // value that is used after vectorization.
2989  SmallPtrSet<VPRecipeBase *, 16> Visited;
2990  auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2991    SmallVector<VPRecipeBase *, 16> Worklist;
2992    Worklist.push_back(Root);
2993
2994 // Traverse the backward slice of Root through its use-def chain.
2995 while (!Worklist.empty()) {
2996 VPRecipeBase *CurRec = Worklist.pop_back_val();
2997
2998 if (!Visited.insert(CurRec).second)
2999 continue;
3000
3001 // Prune search if we find another recipe generating a widen memory
3002 // instruction. Widen memory instructions involved in address computation
3003 // will lead to gather/scatter instructions, which don't need to be
3004 // handled.
3006 VPHeaderPHIRecipe>(CurRec))
3007 continue;
3008
3009 // This recipe contributes to the address computation of a widen
3010 // load/store. If the underlying instruction has poison-generating flags,
3011 // drop them directly.
3012 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3013 VPValue *A, *B;
3014 // Dropping disjoint from an OR may yield incorrect results, as some
3015 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3016 // for dependence analysis). Instead, replace it with an equivalent Add.
3017 // This is possible as all users of the disjoint OR only access lanes
3018 // where the operands are disjoint or poison otherwise.
3019 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3020 RecWithFlags->isDisjoint()) {
3021 VPBuilder Builder(RecWithFlags);
3022 VPInstruction *New = Builder.createOverflowingOp(
3023 Instruction::Add, {A, B}, {false, false},
3024 RecWithFlags->getDebugLoc());
3025 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3026 RecWithFlags->replaceAllUsesWith(New);
3027 RecWithFlags->eraseFromParent();
3028 CurRec = New;
3029 } else
3030 RecWithFlags->dropPoisonGeneratingFlags();
3031      } else {
3032        Instruction *Instr = dyn_cast_or_null<Instruction>(
3033            CurRec->getVPSingleValue()->getUnderlyingValue());
3034        (void)Instr;
3035 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3036 "found instruction with poison generating flags not covered by "
3037 "VPRecipeWithIRFlags");
3038 }
3039
3040 // Add new definitions to the worklist.
3041 for (VPValue *Operand : CurRec->operands())
3042 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3043 Worklist.push_back(OpDef);
3044 }
3045 });
3046
3047 // Traverse all the recipes in the VPlan and collect the poison-generating
3048 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3049 // VPInterleaveRecipe.
3050  auto Iter = vp_depth_first_deep(Plan.getEntry());
3051  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3052    for (VPRecipeBase &Recipe : *VPBB) {
3053 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3054 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3055 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3056 if (AddrDef && WidenRec->isConsecutive() &&
3057 BlockNeedsPredication(UnderlyingInstr.getParent()))
3058 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3059 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3060 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3061 if (AddrDef) {
3062 // Check if any member of the interleave group needs predication.
3063 const InterleaveGroup<Instruction> *InterGroup =
3064 InterleaveRec->getInterleaveGroup();
3065 bool NeedPredication = false;
3066 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3067 I < NumMembers; ++I) {
3068 Instruction *Member = InterGroup->getMember(I);
3069 if (Member)
3070 NeedPredication |= BlockNeedsPredication(Member->getParent());
3071 }
3072
3073 if (NeedPredication)
3074 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3075 }
3076 }
3077 }
3078 }
3079}
3080
3081void VPlanTransforms::createInterleaveGroups(
3082    VPlan &Plan,
3083    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3084        &InterleaveGroups,
3085    VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3086 if (InterleaveGroups.empty())
3087 return;
3088
3089 // Interleave memory: for each Interleave Group we marked earlier as relevant
3090 // for this VPlan, replace the Recipes widening its memory instructions with a
3091 // single VPInterleaveRecipe at its insertion point.
3092 VPDominatorTree VPDT(Plan);
3093 for (const auto *IG : InterleaveGroups) {
3094 auto *Start =
3095 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3096 VPIRMetadata InterleaveMD(*Start);
3097 SmallVector<VPValue *, 4> StoredValues;
3098 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3099 StoredValues.push_back(StoreR->getStoredValue());
3100 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3101 Instruction *MemberI = IG->getMember(I);
3102 if (!MemberI)
3103 continue;
3104 VPWidenMemoryRecipe *MemoryR =
3105 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3106 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3107 StoredValues.push_back(StoreR->getStoredValue());
3108 InterleaveMD.intersect(*MemoryR);
3109 }
3110
3111 bool NeedsMaskForGaps =
3112 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3113 (!StoredValues.empty() && !IG->isFull());
3114
3115 Instruction *IRInsertPos = IG->getInsertPos();
3116 auto *InsertPos =
3117 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3118
3120 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3121 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3122 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3123
3124 // Get or create the start address for the interleave group.
3125 VPValue *Addr = Start->getAddr();
3126 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3127 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3128 // We cannot re-use the address of member zero because it does not
3129 // dominate the insert position. Instead, use the address of the insert
3130 // position and create a PtrAdd adjusting it to the address of member
3131 // zero.
3132 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3133 // InsertPos or sink loads above zero members to join it.
3134 assert(IG->getIndex(IRInsertPos) != 0 &&
3135 "index of insert position shouldn't be zero");
3136 auto &DL = IRInsertPos->getDataLayout();
3137 APInt Offset(32,
3138 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3139 IG->getIndex(IRInsertPos),
3140 /*IsSigned=*/true);
3141 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3142 VPBuilder B(InsertPos);
3143 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3144 }
3145 // If the group is reverse, adjust the index to refer to the last vector
3146 // lane instead of the first. We adjust the index from the first vector
3147 // lane, rather than directly getting the pointer for lane VF - 1, because
3148 // the pointer operand of the interleaved access is supposed to be uniform.
3149 if (IG->isReverse()) {
3150 auto *ReversePtr = new VPVectorEndPointerRecipe(
3151 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3152 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3153 ReversePtr->insertBefore(InsertPos);
3154 Addr = ReversePtr;
3155 }
3156 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3157 InsertPos->getMask(), NeedsMaskForGaps,
3158 InterleaveMD, InsertPos->getDebugLoc());
3159 VPIG->insertBefore(InsertPos);
3160
3161 unsigned J = 0;
3162 for (unsigned i = 0; i < IG->getFactor(); ++i)
3163 if (Instruction *Member = IG->getMember(i)) {
3164 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3165 if (!Member->getType()->isVoidTy()) {
3166 VPValue *OriginalV = MemberR->getVPSingleValue();
3167 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3168 J++;
3169 }
3170 MemberR->eraseFromParent();
3171 }
3172 }
3173}
3174
3175/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3176/// value, phi and backedge value. In the following example:
3177///
3178/// vector.ph:
3179/// Successor(s): vector loop
3180///
3181/// <x1> vector loop: {
3182/// vector.body:
3183/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3184/// ...
3185/// EMIT branch-on-count ...
3186/// No successors
3187/// }
3188///
3189/// WIDEN-INDUCTION will get expanded to:
3190///
3191/// vector.ph:
3192/// ...
3193/// vp<%induction.start> = ...
3194/// vp<%induction.increment> = ...
3195///
3196/// Successor(s): vector loop
3197///
3198/// <x1> vector loop: {
3199/// vector.body:
3200/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3201/// ...
3202/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3203/// EMIT branch-on-count ...
3204/// No successors
3205/// }
3206static void
3207expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3208 VPTypeAnalysis &TypeInfo) {
3209 VPlan *Plan = WidenIVR->getParent()->getPlan();
3210 VPValue *Start = WidenIVR->getStartValue();
3211 VPValue *Step = WidenIVR->getStepValue();
3212 VPValue *VF = WidenIVR->getVFValue();
3213 DebugLoc DL = WidenIVR->getDebugLoc();
3214
3215 // The value from the original loop to which we are mapping the new induction
3216 // variable.
3217 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3218
3219 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3220 Instruction::BinaryOps AddOp;
3221 Instruction::BinaryOps MulOp;
3222 // FIXME: The newly created binary instructions should contain nsw/nuw
3223 // flags, which can be found from the original scalar operations.
3224 VPIRFlags Flags;
3225 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3226 AddOp = Instruction::Add;
3227 MulOp = Instruction::Mul;
3228 } else {
3229 AddOp = ID.getInductionOpcode();
3230 MulOp = Instruction::FMul;
3231 Flags = ID.getInductionBinOp()->getFastMathFlags();
3232 }
3233
3234 // If the phi is truncated, truncate the start and step values.
3235 VPBuilder Builder(Plan->getVectorPreheader());
3236 Type *StepTy = TypeInfo.inferScalarType(Step);
3237 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3238 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3239 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3240 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3241 StepTy = Ty;
3242 }
3243
3244 // Construct the initial value of the vector IV in the vector loop preheader.
3245 Type *IVIntTy =
3246 IntegerType::get(Plan->getContext(), StepTy->getScalarSizeInBits());
3247 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3248 if (StepTy->isFloatingPointTy())
3249 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3250
3251 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3252 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3253
3254 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3255 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3256 DebugLoc::getUnknown(), "induction");
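 // Worked example (hypothetical values): for start 10, step 3 and VF 4 the
 // recipes above produce Init = <10, 13, 16, 19>, i.e.
 // broadcast(start) + step-vector * broadcast(step).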
3257
3258 // Create the widened phi of the vector IV.
3259 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3260 WidenIVR->getDebugLoc(), "vec.ind");
3261 WidePHI->insertBefore(WidenIVR);
3262
3263 // Create the backedge value for the vector IV.
3264 VPValue *Inc;
3265 VPValue *Prev;
3266 // If unrolled, use the increment and prev value from the operands.
3267 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3268 Inc = SplatVF;
3269 Prev = WidenIVR->getLastUnrolledPartOperand();
3270 } else {
3271 if (VPRecipeBase *R = VF->getDefiningRecipe())
3272 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3273 // Multiply the vectorization factor by the step using integer or
3274 // floating-point arithmetic as appropriate.
3275 if (StepTy->isFloatingPointTy())
3276 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3277 DL);
3278 else
3279 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3280 TypeInfo.inferScalarType(VF), DL);
3281
3282 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3283 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3284 Prev = WidePHI;
3285 }
3286
3287 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3288 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3289 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3290 WidenIVR->getDebugLoc(), "vec.ind.next");
3291
3292 WidePHI->addOperand(Next);
3293
3294 WidenIVR->replaceAllUsesWith(WidePHI);
3295}
3296
3297/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3298/// initial value, phi and backedge value. In the following example:
3299///
3300/// <x1> vector loop: {
3301/// vector.body:
3302/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3303/// ...
3304/// EMIT branch-on-count ...
3305/// }
3306///
3307/// WIDEN-POINTER-INDUCTION will get expanded to:
3308///
3309/// <x1> vector loop: {
3310/// vector.body:
3311/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3312/// EMIT %mul = mul %stepvector, %step
3313/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3314/// ...
3315/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3316/// EMIT branch-on-count ...
3317/// }
3318static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3319 VPTypeAnalysis &TypeInfo) {
3320 VPlan *Plan = R->getParent()->getPlan();
3321 VPValue *Start = R->getStartValue();
3322 VPValue *Step = R->getStepValue();
3323 VPValue *VF = R->getVFValue();
3324
3325 assert(R->getInductionDescriptor().getKind() ==
3326 InductionDescriptor::IK_PtrInduction &&
3327 "Not a pointer induction according to InductionDescriptor!");
3328 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3329 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3330 "Recipe should have been replaced");
3331
3332 VPBuilder Builder(R);
3333 DebugLoc DL = R->getDebugLoc();
3334
3335 // Build a scalar pointer phi.
3336 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3337
3338 // Create actual address geps that use the pointer phi as base and a
3339 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3340 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3341 Type *StepTy = TypeInfo.inferScalarType(Step);
3342 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3343 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3344 VPValue *PtrAdd = Builder.createNaryOp(
3345 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
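 // Illustrative example: with VF 4 the offset vector is
 // <0, Step, 2 * Step, 3 * Step>, so vector.gep holds the per-lane addresses
 // pointer.phi + k * Step for k = 0..3.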
3346 R->replaceAllUsesWith(PtrAdd);
3347
3348 // Create the backedge value for the scalar pointer phi.
3349 VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3350 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3351 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3352 DL);
3353 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3354
3355 VPValue *InductionGEP =
3356 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3357 ScalarPtrPhi->addOperand(InductionGEP);
3358}
3359
3360void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3361 // Replace loop regions with explicit CFG.
3362 SmallVector<VPRegionBlock *> LoopRegions;
3363 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3364 vp_depth_first_deep(Plan.getEntry()))) {
3365 if (!R->isReplicator())
3366 LoopRegions.push_back(R);
3367 }
3368 for (VPRegionBlock *R : LoopRegions)
3369 R->dissolveToCFGLoop();
3370}
3371
3373 VPTypeAnalysis TypeInfo(Plan);
3374 SmallVector<VPRecipeBase *> ToRemove;
3375 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3376 vp_depth_first_deep(Plan.getEntry()))) {
3377 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3378 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3379 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3380 ToRemove.push_back(WidenIVR);
3381 continue;
3382 }
3383
3384 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3385 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3386 ToRemove.push_back(WidenIVR);
3387 continue;
3388 }
3389
3390 // Expand VPBlendRecipe into VPInstruction::Select.
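 // Illustrative example: a blend with incoming values v0, v1, v2 and masks
 // m1, m2 is expanded to select(m2, v2, select(m1, v1, v0)).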
3391 VPBuilder Builder(&R);
3392 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3393 VPValue *Select = Blend->getIncomingValue(0);
3394 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3395 Select = Builder.createSelect(Blend->getMask(I),
3396 Blend->getIncomingValue(I), Select,
3397 R.getDebugLoc(), "predphi");
3398 Blend->replaceAllUsesWith(Select);
3399 ToRemove.push_back(Blend);
3400 }
3401
3402 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3403 Expr->decompose();
3404 ToRemove.push_back(Expr);
3405 }
3406
3407 VPValue *VectorStep;
3408 VPValue *ScalarStep;
3410 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3411 continue;
3412
3413 // Expand WideIVStep.
3414 auto *VPI = cast<VPInstruction>(&R);
3415 Type *IVTy = TypeInfo.inferScalarType(VPI);
3416 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3418 ? Instruction::UIToFP
3419 : Instruction::Trunc;
3420 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3421 }
3422
3423 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3424 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3425 ScalarStep =
3426 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3427 }
3428
3429 VPIRFlags Flags;
3430 if (IVTy->isFloatingPointTy())
3431 Flags = {VPI->getFastMathFlags()};
3432
3433 unsigned MulOpc =
3434 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3435 VPInstruction *Mul = Builder.createNaryOp(
3436 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3437 VectorStep = Mul;
3438 VPI->replaceAllUsesWith(VectorStep);
3439 ToRemove.push_back(VPI);
3440 }
3441 }
3442
3443 for (VPRecipeBase *R : ToRemove)
3444 R->eraseFromParent();
3445}
3446
3447void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3448 VPBasicBlock *EarlyExitVPBB,
3449 VPlan &Plan,
3450 VPBasicBlock *HeaderVPBB,
3451 VPBasicBlock *LatchVPBB) {
3452 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3453 if (!EarlyExitVPBB->getSinglePredecessor() &&
3454 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3455 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3456 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3457 "unsupported early exit VPBB");
3458 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3459 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3460 // of the phis.
3461 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3462 cast<VPIRPhi>(&R)->swapOperands();
3463 }
3464
3465 VPBuilder Builder(LatchVPBB->getTerminator());
3466 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3467 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3468 "Terminator must be BranchOnCond");
3469 VPValue *CondOfEarlyExitingVPBB =
3470 EarlyExitingVPBB->getTerminator()->getOperand(0);
3471 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3472 ? CondOfEarlyExitingVPBB
3473 : Builder.createNot(CondOfEarlyExitingVPBB);
3474
3475 // Split the middle block and have it conditionally branch to the early exit
3476 // block if CondToEarlyExit.
3477 VPValue *IsEarlyExitTaken =
3478 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3479 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3480 VPBasicBlock *VectorEarlyExitVPBB =
3481 Plan.createVPBasicBlock("vector.early.exit");
3482 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3483 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3484 NewMiddle->swapSuccessors();
3485
3486 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3487
3488 // Update the exit phis in the early exit block.
3489 VPBuilder MiddleBuilder(NewMiddle);
3490 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3491 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3492 auto *ExitIRI = cast<VPIRPhi>(&R);
3493 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3494 // a single predecessor and 1 if it has two.
3495 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3496 if (ExitIRI->getNumOperands() != 1) {
3497 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3498 // predecessor. Extract its last lane.
3499 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3500 }
3501
3502 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3503 if (!IncomingFromEarlyExit->isLiveIn()) {
3504 // Update the incoming value from the early exit.
3505 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3506 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3507 "first.active.lane");
3508 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3509 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3510 nullptr, "early.exit.value");
3511 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3512 }
3513 }
3514 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3515
3516 // Replace the condition controlling the non-early exit from the vector loop
3517 // with one exiting if either the original condition of the vector latch is
3518 // true or the early exit has been taken.
3519 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3520 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3521 "Unexpected terminator");
3522 auto *IsLatchExitTaken =
3523 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3524 LatchExitingBranch->getOperand(1));
3525 auto *AnyExitTaken = Builder.createNaryOp(
3526 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3527 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3528 LatchExitingBranch->eraseFromParent();
3529}
3530
3531/// This function tries to convert extended in-loop reductions to
3532/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3533/// valid. The created recipe must be decomposed to its constituent
3534/// recipes before execution.
3535static VPExpressionRecipe *
3536tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3537 VFRange &Range) {
3538 Type *RedTy = Ctx.Types.inferScalarType(Red);
3539 VPValue *VecOp = Red->getVecOp();
3540
3541 // Clamp the range if using extended-reduction is profitable.
3542 auto IsExtendedRedValidAndClampRange =
3543 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3545 [&](ElementCount VF) {
3546 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3548
3549 InstructionCost ExtRedCost;
3550 InstructionCost ExtCost =
3551 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3552 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3553
3557 // FIXME: Move partial reduction creation, costing and clamping
3558 // here from LoopVectorize.cpp.
3559 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3560 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3561 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3562 } else {
3563 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3564 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3565 Red->getFastMathFlags(), CostKind);
3566 }
3567 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3568 },
3569 Range);
3570 };
3571
3572 VPValue *A;
3573 // Match reduce(ext(A)).
3574 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3575 IsExtendedRedValidAndClampRange(
3576 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3577 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3578 Ctx.Types.inferScalarType(A)))
3579 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3580
3581 return nullptr;
3582}
3583
3584/// This function tries to convert extended in-loop reductions to
3585/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3586/// and valid. The created VPExpressionRecipe must be decomposed to its
3587/// constituent recipes before execution. Patterns of the
3588/// VPExpressionRecipe:
3589/// reduce.add(mul(...)),
3590/// reduce.add(mul(ext(A), ext(B))),
3591/// reduce.add(ext(mul(ext(A), ext(B)))).
3592static VPExpressionRecipe *
3593tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
3594 VPCostContext &Ctx, VFRange &Range) {
3595 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3596
3597 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3598 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3599 return nullptr;
3600
3601 Type *RedTy = Ctx.Types.inferScalarType(Red);
3602
3603 // Clamp the range if using multiply-accumulate-reduction is profitable.
3604 auto IsMulAccValidAndClampRange =
3606 VPWidenCastRecipe *OuterExt) -> bool {
3608 [&](ElementCount VF) {
3610 Type *SrcTy =
3611 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3612 InstructionCost MulAccCost;
3613
3614 if (IsPartialReduction) {
3615 Type *SrcTy2 =
3616 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3617 // FIXME: Move partial reduction creation, costing and clamping
3618 // here from LoopVectorize.cpp.
3619 MulAccCost = Ctx.TTI.getPartialReductionCost(
3620 Opcode, SrcTy, SrcTy2, RedTy, VF,
3622 Ext0->getOpcode())
3625 Ext1->getOpcode())
3627 Mul->getOpcode(), CostKind);
3628 } else {
3629 // Only partial reductions support mixed extends at the moment.
3630 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3631 return false;
3632
3633 bool IsZExt =
3634 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3635 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3636 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3637 SrcVecTy, CostKind);
3638 }
3639
3640 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3641 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3642 InstructionCost ExtCost = 0;
3643 if (Ext0)
3644 ExtCost += Ext0->computeCost(VF, Ctx);
3645 if (Ext1)
3646 ExtCost += Ext1->computeCost(VF, Ctx);
3647 if (OuterExt)
3648 ExtCost += OuterExt->computeCost(VF, Ctx);
3649
3650 return MulAccCost.isValid() &&
3651 MulAccCost < ExtCost + MulCost + RedCost;
3652 },
3653 Range);
3654 };
3655
3656 VPValue *VecOp = Red->getVecOp();
3657 VPRecipeBase *Sub = nullptr;
3658 VPValue *A, *B;
3659 VPValue *Tmp = nullptr;
3660 // Sub reductions could have a sub between the add reduction and vec op.
3661 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3662 Sub = VecOp->getDefiningRecipe();
3663 VecOp = Tmp;
3664 }
3665
3666 // If ValB is a constant and can be safely extended, truncate it to the same
3667 // type as ExtA's operand, then extend it to the same type as ExtA. This
3668 // creates two uniform extends that can more easily be matched by the rest of
3669 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3670 // replaced with the new extend of the constant.
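 // Illustrative example (hypothetical constant): for mul(zext i8 %a to i32,
 // i32 63), the constant 63 is truncated to i8 and re-extended, giving
 // mul(zext i8 %a to i32, zext(i8 63 to i32)) with two matching uniform
 // extends.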
3671 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3672 VPWidenCastRecipe *&ExtB,
3673 VPValue *&ValB, VPWidenRecipe *Mul) {
3674 if (!ExtA || ExtB || !ValB->isLiveIn())
3675 return;
3676 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3677 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3678 const APInt *Const;
3679 if (!match(ValB, m_APInt(Const)) ||
3681 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3682 return;
3683 // The truncate ensures that the type of each extended operand is the
3684 // same, and it's been proven that the constant can be extended from
3685 // NarrowTy safely. Necessary since ExtA's extended operand would be
3686 // e.g. an i8, while the const will likely be an i32. This will be
3687 // elided by later optimisations.
3688 VPBuilder Builder(Mul);
3689 auto *Trunc =
3690 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3691 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3692 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3693 Mul->setOperand(1, ExtB);
3694 };
3695
3696 // Try to match reduce.add(mul(...)).
3697 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3698 auto *RecipeA =
3699 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3700 auto *RecipeB =
3701 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3702 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3703
3704 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3705 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3706
3707 // Match reduce.add/sub(mul(ext, ext)).
3708 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3709 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3710 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3711 if (Sub)
3712 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3713 cast<VPWidenRecipe>(Sub), Red);
3714 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3715 }
3716 // TODO: Add an expression type for this variant with a negated mul
3717 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3718 return new VPExpressionRecipe(Mul, Red);
3719 }
3720 // TODO: Add an expression type for negated versions of other expression
3721 // variants.
3722 if (Sub)
3723 return nullptr;
3724
3725 // Match reduce.add(ext(mul(A, B))).
3726 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3727 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3728 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3729 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3730 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3731
3732 // reduce.add(ext(mul(ext, const)))
3733 // -> reduce.add(ext(mul(ext, ext(const))))
3734 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3735
3736 // reduce.add(ext(mul(ext(A), ext(B))))
3737 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3738 // The inner extends must either have the same opcode as the outer extend or
3739 // be the same, in which case the multiply can never result in a negative
3740 // value and the outer extend can be folded away by doing wider
3741 // extends for the operands of the mul.
3742 if (Ext0 && Ext1 &&
3743 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3744 Ext0->getOpcode() == Ext1->getOpcode() &&
3745 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3746 auto *NewExt0 = new VPWidenCastRecipe(
3747 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3748 *Ext0, Ext0->getDebugLoc());
3749 NewExt0->insertBefore(Ext0);
3750
3751 VPWidenCastRecipe *NewExt1 = NewExt0;
3752 if (Ext0 != Ext1) {
3753 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3754 Ext->getResultType(), *Ext1, *Ext1,
3755 Ext1->getDebugLoc());
3756 NewExt1->insertBefore(Ext1);
3757 }
3758 Mul->setOperand(0, NewExt0);
3759 Mul->setOperand(1, NewExt1);
3760 Red->setOperand(1, Mul);
3761 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3762 }
3763 }
3764 return nullptr;
3765}
3766
3767/// This function tries to create abstract recipes from the reduction recipe for
3768/// subsequent optimizations and cost estimation.
3769static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3770 VPCostContext &Ctx,
3771 VFRange &Range) {
3772 VPExpressionRecipe *AbstractR = nullptr;
3773 auto IP = std::next(Red->getIterator());
3774 auto *VPBB = Red->getParent();
3775 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3776 AbstractR = MulAcc;
3777 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3778 AbstractR = ExtRed;
3779 // Cannot create abstract inloop reduction recipes.
3780 if (!AbstractR)
3781 return;
3782
3783 AbstractR->insertBefore(*VPBB, IP);
3784 Red->replaceAllUsesWith(AbstractR);
3785}
3786
3797
3799 if (Plan.hasScalarVFOnly())
3800 return;
3801
3802#ifndef NDEBUG
3803 VPDominatorTree VPDT(Plan);
3804#endif
3805
3806 SmallVector<VPValue *> VPValues;
3809 append_range(VPValues, Plan.getLiveIns());
3810 for (VPRecipeBase &R : *Plan.getEntry())
3811 append_range(VPValues, R.definedValues());
3812
3813 auto *VectorPreheader = Plan.getVectorPreheader();
3814 for (VPValue *VPV : VPValues) {
3816 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3817 isa<Constant>(VPV->getLiveInIRValue())))
3818 continue;
3819
3820 // Add explicit broadcast at the insert point that dominates all users.
3821 VPBasicBlock *HoistBlock = VectorPreheader;
3822 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3823 for (VPUser *User : VPV->users()) {
3824 if (User->usesScalars(VPV))
3825 continue;
3826 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3827 HoistPoint = HoistBlock->begin();
3828 else
3829 assert(VPDT.dominates(VectorPreheader,
3830 cast<VPRecipeBase>(User)->getParent()) &&
3831 "All users must be in the vector preheader or dominated by it");
3832 }
3833
3834 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3835 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3836 VPV->replaceUsesWithIf(Broadcast,
3837 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3838 return Broadcast != &U && !U.usesScalars(VPV);
3839 });
3840 }
3841}
3842
3844 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3846 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3847 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3848
3849 VPValue *TC = Plan.getTripCount();
3850 // Skip cases for which the trip count may be non-trivial to materialize.
3851 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
3852 // tail is required.
3853 if (!Plan.hasScalarTail() ||
3855 Plan.getScalarPreheader() ||
3856 !TC->isLiveIn())
3857 return;
3858
3859 // Materialize vector trip counts for constants early if it can simply
3860 // be computed as (Original TC / VF * UF) * VF * UF.
3861 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3862 // tail-folded loops.
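 // Worked example (hypothetical values): for a constant trip count of 17 with
 // VF = 4 and UF = 2, VFxUF = 8 and the vector trip count folds to
 // (17 udiv 8) * 8 = 16.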
3863 ScalarEvolution &SE = *PSE.getSE();
3864 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3865 if (!isa<SCEVConstant>(TCScev))
3866 return;
3867 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3868 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3869 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3870 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3871}
3872
3874 VPBasicBlock *VectorPH) {
3876 if (BTC->getNumUsers() == 0)
3877 return;
3878
3879 VPBuilder Builder(VectorPH, VectorPH->begin());
3880 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3881 auto *TCMO = Builder.createNaryOp(
3882 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
3883 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3884 BTC->replaceAllUsesWith(TCMO);
3885}
3886
3888 if (Plan.hasScalarVFOnly())
3889 return;
3890
3891 VPTypeAnalysis TypeInfo(Plan);
3892 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3893 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3895 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3896 vp_depth_first_shallow(LoopRegion->getEntry()));
3897 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3898 // VPInstructions, excluding ones in replicate regions. Those are not
3899 // materialized explicitly yet. Those vector users are still handled in
3900 // VPReplicateRegion::execute(), via shouldPack().
3901 // TODO: materialize build vectors for replicating recipes in replicating
3902 // regions.
3903 for (VPBasicBlock *VPBB :
3904 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3905 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3907 continue;
3908 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3909 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3910 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3911 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3912 };
3913 if ((isa<VPReplicateRecipe>(DefR) &&
3914 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3915 (isa<VPInstruction>(DefR) &&
3917 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3918 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3919 continue;
3920
3921 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3922 unsigned Opcode = ScalarTy->isStructTy()
3925 auto *BuildVector = new VPInstruction(Opcode, {DefR});
3926 BuildVector->insertAfter(DefR);
3927
3928 DefR->replaceUsesWithIf(
3929 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3930 VPUser &U, unsigned) {
3931 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3932 });
3933 }
3934 }
3935
3936 // Create explicit VPInstructions to convert vectors to scalars. The current
3937 // implementation is conservative - it may miss some cases that may or may not
3938 // be vector values. TODO: introduce Unpacks speculatively - remove them later
3939 // if they are known to operate on scalar values.
3940 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
3941 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3944 continue;
3945 for (VPValue *Def : R.definedValues()) {
3946 // Skip recipes that are single-scalar or only have their first lane
3947 // used.
3948 // TODO: The Defs skipped here may or may not be vector values.
3949 // Introduce Unpacks, and remove them later, if they are guaranteed to
3950 // produce scalar values.
3952 continue;
3953
3954 // At the moment, we create unpacks only for scalar users outside
3955 // replicate regions. Recipes inside replicate regions still extract the
3956 // required lanes implicitly.
3957 // TODO: Remove once replicate regions are unrolled completely.
3958 auto IsCandidateUnpackUser = [Def](VPUser *U) {
3959 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3960 return U->usesScalars(Def) &&
3961 (!ParentRegion || !ParentRegion->isReplicator());
3962 };
3963 if (none_of(Def->users(), IsCandidateUnpackUser))
3964 continue;
3965
3966 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3967 if (R.isPhi())
3968 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3969 else
3970 Unpack->insertAfter(&R);
3971 Def->replaceUsesWithIf(Unpack,
3972 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
3973 return IsCandidateUnpackUser(&U);
3974 });
3975 }
3976 }
3977 }
3978}
3979
3981 VPBasicBlock *VectorPHVPBB,
3982 bool TailByMasking,
3983 bool RequiresScalarEpilogue) {
3984 VPValue &VectorTC = Plan.getVectorTripCount();
3985 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3986 // There's nothing to do if there are no users of the vector trip count or its
3987 // IR value has already been set.
3988 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3989 return;
3990
3991 VPValue *TC = Plan.getTripCount();
3992 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3993 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3994 VPValue *Step = &Plan.getVFxUF();
3995
3996 // If the tail is to be folded by masking, round the number of iterations N
3997 // up to a multiple of Step instead of rounding down. This is done by first
3998 // adding Step-1 and then rounding down. Note that it's ok if this addition
3999 // overflows: the vector induction variable will eventually wrap to zero given
4000 // that it starts at zero and its Step is a power of two; the loop will then
4001 // exit, with the last early-exit vector comparison also producing all-true.
4002 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4003 // is accounted for in emitIterationCountCheck that adds an overflow check.
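 // Worked example (hypothetical values): for an original trip count of 10 and
 // Step = 4, the rounded-up count is 10 + 3 = 13 and the vector trip count
 // computed below becomes 13 - (13 % 4) = 12, so three masked vector
 // iterations cover all 10 original iterations.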
4004 if (TailByMasking) {
4005 TC = Builder.createNaryOp(
4006 Instruction::Add,
4007 {TC, Builder.createNaryOp(Instruction::Sub,
4008 {Step, Plan.getConstantInt(TCTy, 1)})},
4009 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4010 }
4011
4012 // Now we need to generate the expression for the part of the loop that the
4013 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4014 // iterations are not required for correctness, or N - Step, otherwise. Step
4015 // is equal to the vectorization factor (number of SIMD elements) times the
4016 // unroll factor (number of SIMD instructions).
4017 VPValue *R =
4018 Builder.createNaryOp(Instruction::URem, {TC, Step},
4019 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4020
4021 // There are cases where we *must* run at least one iteration in the remainder
4022 // loop. See the cost model for when this can happen. If the step evenly
4023 // divides the trip count, we set the remainder to be equal to the step. If
4024 // the step does not evenly divide the trip count, no adjustment is necessary
4025 // since there will already be scalar iterations. Note that the minimum
4026 // iterations check ensures that N >= Step.
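 // Worked example (hypothetical values): for a trip count of 16 and Step = 4,
 // the remainder is 0 and is bumped to 4 below, so the vector loop executes
 // 16 - 4 = 12 iterations and the last 4 run in the scalar epilogue.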
4027 if (RequiresScalarEpilogue) {
4028 assert(!TailByMasking &&
4029 "requiring scalar epilogue is not supported with tail folding");
4030 VPValue *IsZero =
4031 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4032 R = Builder.createSelect(IsZero, Step, R);
4033 }
4034
4035 VPValue *Res = Builder.createNaryOp(
4036 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4037 VectorTC.replaceAllUsesWith(Res);
4038}
4039
4041 ElementCount VFEC) {
4042 VPBuilder Builder(VectorPH, VectorPH->begin());
4043 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4044 VPValue &VF = Plan.getVF();
4045 VPValue &VFxUF = Plan.getVFxUF();
4046 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4047 // used.
4048 // TODO: Assert that they aren't used.
4049
4050 // If there are no users of the runtime VF, compute VFxUF by constant folding
4051 // the multiplication of VF and UF.
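 // Illustrative example (hypothetical values): with UF = 2 and a scalable VF
 // of <vscale x 4>, this materializes VFxUF as a single vscale * 8
 // expression; for a fixed VF of 4 it folds to the constant 8.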
4052 if (VF.getNumUsers() == 0) {
4053 VPValue *RuntimeVFxUF =
4054 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4055 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4056 return;
4057 }
4058
4059 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4060 // vscale) * UF.
4061 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4063 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4065 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4066 }
4067 VF.replaceAllUsesWith(RuntimeVF);
4068
4069 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4070 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4071 VFxUF.replaceAllUsesWith(MulByUF);
4072}
4073
4076 const DataLayout &DL = SE.getDataLayout();
4077 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4078
4079 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4080 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4081 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4082 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4084 continue;
4085 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4086 if (!ExpSCEV)
4087 break;
4088 const SCEV *Expr = ExpSCEV->getSCEV();
4089 Value *Res =
4090 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4091 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4092 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4093 ExpSCEV->replaceAllUsesWith(Exp);
4094 if (Plan.getTripCount() == ExpSCEV)
4095 Plan.resetTripCount(Exp);
4096 ExpSCEV->eraseFromParent();
4097 }
4099 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4100 "after any VPIRInstructions");
4101 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4102 // to the VPIRBasicBlock.
4103 auto EI = Entry->begin();
4104 for (Instruction &I : drop_end(*EntryBB)) {
4105 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4106 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4107 EI++;
4108 continue;
4109 }
4111 }
4112
4113 return ExpandedSCEVs;
4114}
4115
4116/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
4117/// can be converted to a narrower recipe. \p OpV is used by a wide recipe
4118/// feeding a store interleave group at index \p Idx; \p WideMember0 is the
4119/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe
4120/// can be narrowed to an index-independent load if it feeds all wide ops at
4121/// all indices (\p OpV must be the operand at index \p OpIdx of both \p
4122/// WideMember0 and the recipe at index \p Idx). A VPInterleaveRecipe can be
4123/// narrowed to a wide load if \p OpV is defined at \p Idx of a load interleave group.
4124static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4125 VPValue *OpV, unsigned Idx) {
4126 auto *DefR = OpV->getDefiningRecipe();
4127 if (!DefR)
4128 return WideMember0->getOperand(OpIdx) == OpV;
4129 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
4130 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
4131
4132 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
4133 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4134 return false;
4135}
4136
4137/// Returns true if \p InterleaveR is a full interleave group with factor and
4138/// number of members both equal to \p VF. The interleave group must also
4139/// access the full vector width \p VectorRegWidth.
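/// For example (illustrative): a group with factor 4 and 4 i32 members at
/// VF 4 qualifies when the vector register width is 4 * 32 = 128 bits.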
4141 unsigned VF, VPTypeAnalysis &TypeInfo,
4142 unsigned VectorRegWidth) {
4143 if (!InterleaveR || InterleaveR->getMask())
4144 return false;
4145
4146 Type *GroupElementTy = nullptr;
4147 if (InterleaveR->getStoredValues().empty()) {
4148 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4149 if (!all_of(InterleaveR->definedValues(),
4150 [&TypeInfo, GroupElementTy](VPValue *Op) {
4151 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4152 }))
4153 return false;
4154 } else {
4155 GroupElementTy =
4156 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4157 if (!all_of(InterleaveR->getStoredValues(),
4158 [&TypeInfo, GroupElementTy](VPValue *Op) {
4159 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4160 }))
4161 return false;
4162 }
4163
4164 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
4165 auto IG = InterleaveR->getInterleaveGroup();
4166 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
4167 GroupSize == VectorRegWidth;
4168}
4169
4170/// Returns true if \p VPValue is a narrow VPValue.
4171static bool isAlreadyNarrow(VPValue *VPV) {
4172 if (VPV->isLiveIn())
4173 return true;
4174 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4175 return RepR && RepR->isSingleScalar();
4176}
4177
4179 unsigned VectorRegWidth) {
4180 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4181 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4182 return;
4183
4184 VPTypeAnalysis TypeInfo(Plan);
4185
4186 unsigned VFMinVal = VF.getKnownMinValue();
4188 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4190 continue;
4191
4194 continue;
4195
4196 // Bail out on recipes not supported at the moment:
4197 // * phi recipes other than the canonical induction
4198 // * recipes writing to memory except interleave groups
4199 // Only support plans with a canonical induction phi.
4200 if (R.isPhi())
4201 return;
4202
4203 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4204 if (R.mayWriteToMemory() && !InterleaveR)
4205 return;
4206
4207 // Do not narrow interleave groups if there are VectorPointer recipes and
4208 // the plan was unrolled. The recipe implicitly uses VF from
4209 // VPTransformState.
4210 // TODO: Remove restriction once the VF for the VectorPointer offset is
4211 // modeled explicitly as operand.
4212 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4213 return;
4214
4215 // All other ops are allowed, but we reject uses that cannot be converted
4216 // when checking all allowed consumers (store interleave groups) below.
4217 if (!InterleaveR)
4218 continue;
4219
4220 // Bail out on non-consecutive interleave groups.
4221 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4222 VectorRegWidth))
4223 return;
4224
4225 // Skip read interleave groups.
4226 if (InterleaveR->getStoredValues().empty())
4227 continue;
4228
4229 // Narrow interleave groups, if all operands are already matching narrow
4230 // ops.
4231 auto *Member0 = InterleaveR->getStoredValues()[0];
4232 if (isAlreadyNarrow(Member0) &&
4233 all_of(InterleaveR->getStoredValues(),
4234 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4235 StoreGroups.push_back(InterleaveR);
4236 continue;
4237 }
4238
4239 // For now, we only support full interleave groups storing load interleave
4240 // groups.
4241 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4242 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4243 if (!DefR)
4244 return false;
4245 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4246 return IR && IR->getInterleaveGroup()->isFull() &&
4247 IR->getVPValue(Op.index()) == Op.value();
4248 })) {
4249 StoreGroups.push_back(InterleaveR);
4250 continue;
4251 }
4252
4253 // Check if all values feeding InterleaveR are matching wide recipes whose
4254 // operands can be narrowed.
4255 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4256 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4257 if (!WideMember0)
4258 return;
4259 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4260 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4261 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4262 R->getNumOperands() > 2)
4263 return;
4264 if (any_of(enumerate(R->operands()),
4265 [WideMember0, Idx = I](const auto &P) {
4266 const auto &[OpIdx, OpV] = P;
4267 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4268 }))
4269 return;
4270 }
4271 StoreGroups.push_back(InterleaveR);
4272 }
4273
4274 if (StoreGroups.empty())
4275 return;
4276
4277 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4278 SmallPtrSet<VPValue *, 4> NarrowedOps;
4279 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4280 auto *R = V->getDefiningRecipe();
4281 if (!R || NarrowedOps.contains(V))
4282 return V;
4283 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4284 // Narrow interleave group to wide load, as transformed VPlan will only
4285 // process one original iteration.
4286 auto *LI =
4287 cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4288 auto *L = new VPWidenLoadRecipe(
4289 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4290 /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
4291 L->insertBefore(LoadGroup);
4292 NarrowedOps.insert(L);
4293 return L;
4294 }
4295
4296 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4297 assert(RepR->isSingleScalar() &&
4298 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4299 "must be a single scalar load");
4300 NarrowedOps.insert(RepR);
4301 return RepR;
4302 }
4303 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4304
4305 VPValue *PtrOp = WideLoad->getAddr();
4306 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4307 PtrOp = VecPtr->getOperand(0);
4308 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4309 // process one original iteration.
4310 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4311 /*IsUniform*/ true,
4312 /*Mask*/ nullptr, *WideLoad);
4313 N->insertBefore(WideLoad);
4314 NarrowedOps.insert(N);
4315 return N;
4316 };
4317
4318 // Narrow operation tree rooted at store groups.
4319 for (auto *StoreGroup : StoreGroups) {
4320 VPValue *Res = nullptr;
4321 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4322 if (isAlreadyNarrow(Member0)) {
4323 Res = Member0;
4324 } else if (auto *WideMember0 =
4326 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4327 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4328 Res = WideMember0;
4329 } else {
4330 Res = NarrowOp(Member0);
4331 }
4332
4333 auto *SI =
4334 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4335 auto *S = new VPWidenStoreRecipe(
4336 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4337 /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
4338 S->insertBefore(StoreGroup);
4339 StoreGroup->eraseFromParent();
4340 }
4341
4342 // Adjust induction to reflect that the transformed plan only processes one
4343 // original iteration.
4344 auto *CanIV = VectorLoop->getCanonicalIV();
4345 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4346 VPBuilder PHBuilder(Plan.getVectorPreheader());
4347
4348 VPValue *UF = Plan.getOrAddLiveIn(
4349 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4350 if (VF.isScalable()) {
4351 VPValue *VScale = PHBuilder.createElementCount(
4353 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4354 Inc->setOperand(1, VScaleUF);
4355 Plan.getVF().replaceAllUsesWith(VScale);
4356 } else {
4357 Inc->setOperand(1, UF);
4359 Plan.getConstantInt(CanIV->getScalarType(), 1));
4360 }
4361 removeDeadRecipes(Plan);
4362}
4363
4364/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4365/// BranchOnCond recipe.
4367 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4368 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4369 auto *MiddleTerm =
4370 dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4371 // Only add branch metadata if there is a (conditional) terminator.
4372 if (!MiddleTerm)
4373 return;
4374
4375 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4376 "must have a BranchOnCond");
4377 // Assume that `TripCount % VectorStep ` is equally distributed.
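 // Illustrative example (hypothetical values): with a fixed VF of 4 and
 // UF = 2, VectorStep below is 8 and the weights are {1, 7}, reflecting the
 // assumption that TripCount % 8 is 0 in one of eight equally likely cases.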
4378 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4379 if (VF.isScalable() && VScaleForTuning.has_value())
4380 VectorStep *= *VScaleForTuning;
4381 assert(VectorStep > 0 && "trip count should not be zero");
4382 MDBuilder MDB(Plan.getContext());
4383 MDNode *BranchWeights =
4384 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4385 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4386}
4387
4388/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4389/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4390/// the end value of the induction.
4392 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4393 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4394 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4395 // Truncated wide inductions resume from the last lane of their vector value
4396 // in the last vector iteration which is handled elsewhere.
4397 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4398 return nullptr;
4399
4400 VPValue *Start = WideIV->getStartValue();
4401 VPValue *Step = WideIV->getStepValue();
4403 VPValue *EndValue = VectorTC;
4404 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4405 EndValue = VectorPHBuilder.createDerivedIV(
4406 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4407 Start, VectorTC, Step);
4408 }
4409
4410 // EndValue is derived from the vector trip count (which has the same type as
4411 // the widest induction) and thus may be wider than the induction here.
4412 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4413 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4414 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4415 ScalarTypeOfWideIV,
4416 WideIV->getDebugLoc());
4417 }
4418
4419 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4420 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4421 return ResumePhiRecipe;
4422}
4423
4425 VPlan &Plan, VPRecipeBuilder &Builder,
4426 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4427 VPTypeAnalysis TypeInfo(Plan);
4428 auto *ScalarPH = Plan.getScalarPreheader();
4429 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4430 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4431 VPBuilder VectorPHBuilder(
4432 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4433 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4434 VPBuilder ScalarPHBuilder(ScalarPH);
4435 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4436 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4437
4438 // TODO: Extract final value from induction recipe initially, optimize to
4439 // pre-computed end value together in optimizeInductionExitUsers.
4440 auto *VectorPhiR =
4441 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4442 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4444 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4445 &Plan.getVectorTripCount())) {
4446 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4447 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4448 ScalarPhiIRI->addOperand(ResumePhi);
4449 continue;
4450 }
4451 // TODO: Also handle truncated inductions here. Computing end-values
4452 // separately should be done as VPlan-to-VPlan optimization, after
4453 // legalizing all resume values to use the last lane from the loop.
4454 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4455 "should only skip truncated wide inductions");
4456 continue;
4457 }
4458
4459 // The backedge value provides the value to resume coming out of a loop,
4460 // which for FORs is a vector whose last element needs to be extracted. The
4461 // start value provides the value if the loop is bypassed.
4462 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4463 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4464 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4465 "Cannot handle loops with uncountable early exits");
4466 if (IsFOR)
4467 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4468 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4469 "vector.recur.extract");
4470 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4471 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4472 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4473 ScalarPhiIRI->addOperand(ResumePhiR);
4474 }
4475}
4476
4478 VFRange &Range) {
4479 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4480 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4481 auto *MiddleVPBB = Plan.getMiddleBlock();
4482 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4483 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4484
4485 auto IsScalableOne = [](ElementCount VF) -> bool {
4486 return VF == ElementCount::getScalable(1);
4487 };
4488
4489 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4490 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4491 if (!FOR)
4492 continue;
4493
4494 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4495 "Cannot handle loops with uncountable early exits");
4496
4497 // This is the second phase of vectorizing first-order recurrences, creating
4498 // extract for users outside the loop. An overview of the transformation is
4499 // described below. Suppose we have the following loop with some use after
4500 // the loop of the last a[i-1],
4501 //
4502 // for (int i = 0; i < n; ++i) {
4503 // t = a[i - 1];
4504 // b[i] = a[i] - t;
4505 // }
4506 // use t;
4507 //
4508 // There is a first-order recurrence on "a". For this loop, the shorthand
4509 // scalar IR looks like:
4510 //
4511 // scalar.ph:
4512 // s.init = a[-1]
4513 // br scalar.body
4514 //
4515 // scalar.body:
4516 // i = phi [0, scalar.ph], [i+1, scalar.body]
4517 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4518 // s2 = a[i]
4519 // b[i] = s2 - s1
4520 // br cond, scalar.body, exit.block
4521 //
4522 // exit.block:
4523 // use = lcssa.phi [s1, scalar.body]
4524 //
4525 // In this example, s1 is a recurrence because its value depends on the
4526 // previous iteration. In the first phase of vectorization, we created a
4527 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4528 // for users in the scalar preheader and exit block.
4529 //
4530 // vector.ph:
4531 // v_init = vector(..., ..., ..., a[-1])
4532 // br vector.body
4533 //
4534 // vector.body
4535 // i = phi [0, vector.ph], [i+4, vector.body]
4536 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4537 // v2 = a[i, i+1, i+2, i+3]
4538 // b[i] = v2 - v1
4539 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4540 // b[i, i+1, i+2, i+3] = v2 - v1
4541 // br cond, vector.body, middle.block
4542 //
4543 // middle.block:
4544 // vector.recur.extract.for.phi = v2(2)
4545 // vector.recur.extract = v2(3)
4546 // br cond, scalar.ph, exit.block
4547 //
4548 // scalar.ph:
4549 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4550 // [s.init, otherwise]
4551 // br scalar.body
4552 //
4553 // scalar.body:
4554 // i = phi [0, scalar.ph], [i+1, scalar.body]
4555 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4556 // s2 = a[i]
4557 // b[i] = s2 - s1
4558 // br cond, scalar.body, exit.block
4559 //
4560 // exit.block:
4561 // lo = lcssa.phi [s1, scalar.body],
4562 // [vector.recur.extract.for.phi, middle.block]
4563 //
4564 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4565 // Extract the penultimate value of the recurrence and use it as operand for
4566 // the VPIRInstruction modeling the phi.
4567 for (VPUser *U : FOR->users()) {
4568 using namespace llvm::VPlanPatternMatch;
4569 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4570 continue;
4571 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4572 // penultimate value of the recurrence. Instead we rely on the existing
4573 // extract of the last element from the result of
4574 // VPInstruction::FirstOrderRecurrenceSplice.
4575 // TODO: Consider vscale_range info and UF.
4577 Range))
4578 return;
4579 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4580 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4581 {}, "vector.recur.extract.for.phi");
4582 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4583 }
4584 }
4585}
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1557
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3534
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3821
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3896
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3848
iterator end()
Definition VPlan.h:3858
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3856
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3909
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:220
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:593
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:638
const VPRecipeBase & back() const
Definition VPlan.h:3870
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2409
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2443
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2433
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2449
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2429
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:187
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:208
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:127
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:146
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:165
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2953
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3477
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3642
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3565
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:2998
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1984
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2032
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2021
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3974
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:3998
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:976
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1060
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1011
@ FirstOrderRecurrenceSplice
Definition VPlan.h:982
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1006
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1003
@ CanonicalIVIncrementForPart
Definition VPlan.h:996
@ CalculateTripCountMinusVF
Definition VPlan.h:994
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2550
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2542
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2571
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2623
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2582
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3139
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPRegionBlock * getRegion()
Definition VPlan.h:4126
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2831
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2672
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4009
const VPBlockBase * getEntry() const
Definition VPlan.h:4045
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4120
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4077
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4062
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4107
const VPBlockBase * getExiting() const
Definition VPlan.h:4057
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4070
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2875
bool isSingleScalar() const
Definition VPlan.h:2920
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2944
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3711
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1393
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1397
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1848
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3606
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1489
Instruction::CastOps getOpcode() const
Definition VPlan.h:1540
A recipe for handling GEP instructions.
Definition VPlan.h:1776
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2049
PHINode * getPHINode() const
Definition VPlan.h:2091
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2077
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2094
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2124
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2205
A recipe for widening vector intrinsics.
Definition VPlan.h:1547
A common base class for widening memory operations.
Definition VPlan.h:3181
A recipe for widened phis.
Definition VPlan.h:2260
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1446
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4139
bool hasVF(ElementCount VF) const
Definition VPlan.h:4345
LLVMContext & getContext() const
Definition VPlan.h:4333
VPBasicBlock * getEntry()
Definition VPlan.h:4233
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4324
bool hasScalableVF() const
Definition VPlan.h:4346
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4331
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4327
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4295
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4402
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4316
unsigned getUF() const
Definition VPlan.h:4365
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4479
bool hasUF(unsigned UF) const
Definition VPlan.h:4363
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4285
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4408
void setVF(ElementCount VF)
Definition VPlan.h:4339
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4378
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4309
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4258
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4457
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4405
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4387
bool hasScalarVFOnly() const
Definition VPlan.h:4356
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4276
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4427
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4281
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4424
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4238
void setUF(unsigned UF)
Definition VPlan.h:4370
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4511
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:314
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:257
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1734
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:830
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2302
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3312
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3272
A recipe for widening select instructions.
Definition VPlan.h:1730
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3394
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3352
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.