1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/SetVector.h"
28#include "llvm/ADT/TypeSwitch.h"
34#include "llvm/IR/Intrinsics.h"
35#include "llvm/IR/MDBuilder.h"
39
40using namespace llvm;
41using namespace VPlanPatternMatch;
42
44 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
45 cl::desc("Enable use of wide get active lane mask instructions"));
46
47bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
48 VPlan &Plan,
49 function_ref<const InductionDescriptor *(PHINode *)>
50 GetIntOrFpInductionDescriptor,
51 const TargetLibraryInfo &TLI) {
52
54 Plan.getVectorLoopRegion());
56 // Skip blocks outside region
57 if (!VPBB->getParent())
58 break;
59 VPRecipeBase *Term = VPBB->getTerminator();
60 auto EndIter = Term ? Term->getIterator() : VPBB->end();
61 // Introduce each ingredient into VPlan.
62 for (VPRecipeBase &Ingredient :
63 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
64
65 VPValue *VPV = Ingredient.getVPSingleValue();
66 if (!VPV->getUnderlyingValue())
67 continue;
68
69 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
70
71 VPRecipeBase *NewRecipe = nullptr;
72 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
73 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
74 const auto *II = GetIntOrFpInductionDescriptor(Phi);
75 if (!II) {
76 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
77 for (VPValue *Op : PhiR->operands())
78 NewRecipe->addOperand(Op);
79 } else {
80 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
81 VPValue *Step =
83 NewRecipe = new VPWidenIntOrFpInductionRecipe(
84 Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
85 }
86 } else {
87 assert(isa<VPInstruction>(&Ingredient) &&
88 "only VPInstructions expected here");
89 assert(!isa<PHINode>(Inst) && "phis should be handled above");
90 // Create VPWidenMemoryRecipe for loads and stores.
91 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
92 NewRecipe = new VPWidenLoadRecipe(
93 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
94 false /*Consecutive*/, false /*Reverse*/, Load->getAlign(),
95 VPIRMetadata(*Load), Ingredient.getDebugLoc());
96 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
97 NewRecipe = new VPWidenStoreRecipe(
98 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
99 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
100 Store->getAlign(), VPIRMetadata(*Store),
101 Ingredient.getDebugLoc());
102 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
103 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
104 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
105 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
106 if (VectorID == Intrinsic::not_intrinsic)
107 return false;
108 NewRecipe = new VPWidenIntrinsicRecipe(
109 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
110 drop_end(Ingredient.operands()), CI->getType(),
111 CI->getDebugLoc());
112 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
113 NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
114 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
115 NewRecipe = new VPWidenCastRecipe(
116 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
117 } else {
118 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
119 }
120 }
121
122 NewRecipe->insertBefore(&Ingredient);
123 if (NewRecipe->getNumDefinedValues() == 1)
124 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
125 else
126 assert(NewRecipe->getNumDefinedValues() == 0 &&
127 "Only recpies with zero or one defined values expected");
128 Ingredient.eraseFromParent();
129 }
130 }
131 return true;
132}
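// Illustrative sketch (not part of the upstream source): for an input loop
// body such as
//
//   %l = load i32, ptr %gep
//   %a = add i32 %l, 1
//   store i32 %a, ptr %gep
//
// the wrapping VPInstructions are replaced above by a VPWidenLoadRecipe, a
// VPWidenRecipe (for the add) and a VPWidenStoreRecipe, each reusing the
// original VPValue operands.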
133
134/// Return true if we do not know how to (mechanically) hoist or sink \p R out
135/// of a loop region.
136static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
137 // Assume intrinsics don't alias anything or throw; as long as they're
138 // guaranteed to execute, they're safe to hoist.
140 return false;
141
142 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
143 // memory location is not modified in the vector loop.
144 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
145 return true;
146
147 // Allocas cannot be hoisted.
148 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
149 return RepR && RepR->getOpcode() == Instruction::Alloca;
150}
151
152static bool sinkScalarOperands(VPlan &Plan) {
153 auto Iter = vp_depth_first_deep(Plan.getEntry());
154 bool Changed = false;
155 // First, collect the operands of all recipes in replicate blocks as seeds for
156 // sinking.
159 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
160 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
161 continue;
162 VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
163 if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
164 continue;
165 for (auto &Recipe : *VPBB) {
166 for (VPValue *Op : Recipe.operands())
167 if (auto *Def =
168 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
169 WorkList.insert({VPBB, Def});
170 }
171 }
172
173 bool ScalarVFOnly = Plan.hasScalarVFOnly();
174 // Try to sink each replicate or scalar IV steps recipe in the worklist.
175 for (unsigned I = 0; I != WorkList.size(); ++I) {
176 VPBasicBlock *SinkTo;
177 VPSingleDefRecipe *SinkCandidate;
178 std::tie(SinkTo, SinkCandidate) = WorkList[I];
179 if (SinkCandidate->getParent() == SinkTo ||
180 SinkCandidate->mayHaveSideEffects() ||
181 SinkCandidate->mayReadOrWriteMemory())
182 continue;
183 if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
184 if (!ScalarVFOnly && RepR->isSingleScalar())
185 continue;
186 } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
187 continue;
188
189 bool NeedsDuplicating = false;
190 // All recipe users of the sink candidate must be in the same block SinkTo,
191 // or all users outside of SinkTo must be uniform-after-vectorization
192 // (i.e., only the first lane is used). In the latter case, we need to
193 // duplicate SinkCandidate.
194 auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
195 SinkCandidate](VPUser *U) {
196 auto *UI = cast<VPRecipeBase>(U);
197 if (UI->getParent() == SinkTo)
198 return true;
199 NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
200 // We only know how to duplicate VPReplicateRecipes and
201 // VPScalarIVStepsRecipes for now.
202 return NeedsDuplicating &&
204 };
205 if (!all_of(SinkCandidate->users(), CanSinkWithUser))
206 continue;
207
208 if (NeedsDuplicating) {
209 if (ScalarVFOnly)
210 continue;
211 VPSingleDefRecipe *Clone;
212 if (auto *SinkCandidateRepR =
213 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
214 // TODO: Handle converting to uniform recipes as separate transform,
215 // then cloning should be sufficient here.
216 Instruction *I = SinkCandidate->getUnderlyingInstr();
217 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
218 nullptr /*Mask*/, *SinkCandidateRepR);
219 // TODO: add ".cloned" suffix to name of Clone's VPValue.
220 } else {
221 Clone = SinkCandidate->clone();
222 }
223
224 Clone->insertBefore(SinkCandidate);
225 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
226 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
227 });
228 }
229 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
230 for (VPValue *Op : SinkCandidate->operands())
231 if (auto *Def =
232 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
233 WorkList.insert({SinkTo, Def});
234 Changed = true;
235 }
236 return Changed;
237}
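// Illustrative sketch (assumed example, not from the upstream source): a
// replicate recipe computing an address that is only used by a predicated
// store inside a replicate region,
//
//   vector.body:      %gep = getelementptr ...   ; only user is the store
//   pred.store.if:    store %v, %gep
//
// is sunk into pred.store.if above, so it only executes for active lanes.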
238
239/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
240/// the mask.
241static VPValue *getPredicatedMask(VPRegionBlock *R) {
242 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
243 if (!EntryBB || EntryBB->size() != 1 ||
244 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
245 return nullptr;
246
247 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
248}
249
250/// If \p R is a triangle region, return the 'then' block of the triangle.
251static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
252 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
253 if (EntryBB->getNumSuccessors() != 2)
254 return nullptr;
255
256 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
257 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
258 if (!Succ0 || !Succ1)
259 return nullptr;
260
261 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
262 return nullptr;
263 if (Succ0->getSingleSuccessor() == Succ1)
264 return Succ0;
265 if (Succ1->getSingleSuccessor() == Succ0)
266 return Succ1;
267 return nullptr;
268}
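// For illustration (sketch, not part of the upstream source), the triangle
// recognized above has the shape
//
//        entry (BranchOnMask %mask)
//        /    \
//     then     |
//        \    /
//        exiting
//
// i.e. the entry has two successors and exactly one of them ('then') has the
// other as its single successor.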
269
270// Merge replicate regions into their successor region, if a replicate region
271// is connected to a successor replicate region with the same predicate by a
272// single, empty VPBasicBlock.
274 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
275
276 // Collect replicate regions followed by an empty block, followed by another
277 // replicate region with matching masks, to process up front. This is to avoid
278 // iterator invalidation issues while merging regions.
281 vp_depth_first_deep(Plan.getEntry()))) {
282 if (!Region1->isReplicator())
283 continue;
284 auto *MiddleBasicBlock =
285 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
286 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
287 continue;
288
289 auto *Region2 =
290 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
291 if (!Region2 || !Region2->isReplicator())
292 continue;
293
294 VPValue *Mask1 = getPredicatedMask(Region1);
295 VPValue *Mask2 = getPredicatedMask(Region2);
296 if (!Mask1 || Mask1 != Mask2)
297 continue;
298
299 assert(Mask1 && Mask2 && "both regions must have conditions");
300 WorkList.push_back(Region1);
301 }
302
303 // Move recipes from Region1 to its successor region, if both are triangles.
304 for (VPRegionBlock *Region1 : WorkList) {
305 if (TransformedRegions.contains(Region1))
306 continue;
307 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
308 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
309
310 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
311 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
312 if (!Then1 || !Then2)
313 continue;
314
315 // Note: No fusion-preventing memory dependencies are expected in either
316 // region. Such dependencies should be rejected during earlier dependence
317 // checks, which guarantee accesses can be re-ordered for vectorization.
318 //
319 // Move recipes to the successor region.
320 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
321 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
322
323 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
324 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
325
326 // Move VPPredInstPHIRecipes from the merge block to the successor region's
327 // merge block. Update all users inside the successor region to use the
328 // original values.
329 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
330 VPValue *PredInst1 =
331 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
332 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
333 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
334 return cast<VPRecipeBase>(&U)->getParent() == Then2;
335 });
336
337 // Remove phi recipes that are unused after merging the regions.
338 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
339 Phi1ToMove.eraseFromParent();
340 continue;
341 }
342 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
343 }
344
345 // Remove the dead recipes in Region1's entry block.
346 for (VPRecipeBase &R :
347 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
348 R.eraseFromParent();
349
350 // Finally, remove the first region.
351 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
352 VPBlockUtils::disconnectBlocks(Pred, Region1);
353 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
354 }
355 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
356 TransformedRegions.insert(Region1);
357 }
358
359 return !TransformedRegions.empty();
360}
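// Illustrative sketch (assumed example): two replicate regions guarded by the
// same mask and separated only by an empty block,
//
//   region.1 (mask %m) -> empty.bb -> region.2 (mask %m),
//
// are fused by moving the recipes of region.1's 'then' block and the
// VPPredInstPHIRecipes of its merge block into region.2; region.1 is then
// disconnected and its predecessors are wired directly to empty.bb.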
361
363 VPlan &Plan) {
364 Instruction *Instr = PredRecipe->getUnderlyingInstr();
365 // Build the triangular if-then region.
366 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
367 assert(Instr->getParent() && "Predicated instruction not in any basic block");
368 auto *BlockInMask = PredRecipe->getMask();
369 auto *MaskDef = BlockInMask->getDefiningRecipe();
370 auto *BOMRecipe = new VPBranchOnMaskRecipe(
371 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
372 auto *Entry =
373 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
374
375 // Replace predicated replicate recipe with a replicate recipe without a
376 // mask but in the replicate region.
377 auto *RecipeWithoutMask = new VPReplicateRecipe(
378 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
379 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
380 auto *Pred =
381 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
382
383 VPPredInstPHIRecipe *PHIRecipe = nullptr;
384 if (PredRecipe->getNumUsers() != 0) {
385 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
386 RecipeWithoutMask->getDebugLoc());
387 PredRecipe->replaceAllUsesWith(PHIRecipe);
388 PHIRecipe->setOperand(0, RecipeWithoutMask);
389 }
390 PredRecipe->eraseFromParent();
391 auto *Exiting =
392 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
394 Plan.createReplicateRegion(Entry, Exiting, RegionName);
395
396 // Note: first set Entry as region entry and then connect successors starting
397 // from it in order, to propagate the "parent" of each VPBasicBlock.
398 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
399 VPBlockUtils::connectBlocks(Pred, Exiting);
400
401 return Region;
402}
403
404static void addReplicateRegions(VPlan &Plan) {
407 vp_depth_first_deep(Plan.getEntry()))) {
408 for (VPRecipeBase &R : *VPBB)
409 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
410 if (RepR->isPredicated())
411 WorkList.push_back(RepR);
412 }
413 }
414
415 unsigned BBNum = 0;
416 for (VPReplicateRecipe *RepR : WorkList) {
417 VPBasicBlock *CurrentBlock = RepR->getParent();
418 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
419
420 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
421 SplitBlock->setName(
422 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
423 // Record predicated instructions for above packing optimizations.
425 Region->setParent(CurrentBlock->getParent());
427
428 VPRegionBlock *ParentRegion = Region->getParent();
429 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
430 ParentRegion->setExiting(SplitBlock);
431 }
432}
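// For example (illustrative, assumed input): a predicated VPReplicateRecipe
// for "%d = udiv i32 %a, %b" with mask %m is rewritten by the code above
// (via createReplicateRegion) into a "pred.udiv" region:
//
//   pred.udiv.entry:    BranchOnMask %m
//   pred.udiv.if:       %d = udiv i32 %a, %b    (unmasked replicate)
//   pred.udiv.continue: VPPredInstPHIRecipe %d  (only if %d has users)
//
// with users of the original recipe redirected to the phi.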
433
434/// Remove redundant VPBasicBlocks by merging them into their predecessor if
435/// the predecessor has a single successor.
439 vp_depth_first_deep(Plan.getEntry()))) {
440 // Don't fold the blocks in the skeleton of the Plan into their single
441 // predecessors for now.
442 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
443 if (!VPBB->getParent())
444 continue;
445 auto *PredVPBB =
446 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
447 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
448 isa<VPIRBasicBlock>(PredVPBB))
449 continue;
450 WorkList.push_back(VPBB);
451 }
452
453 for (VPBasicBlock *VPBB : WorkList) {
454 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
455 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
456 R.moveBefore(*PredVPBB, PredVPBB->end());
457 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
458 auto *ParentRegion = VPBB->getParent();
459 if (ParentRegion && ParentRegion->getExiting() == VPBB)
460 ParentRegion->setExiting(PredVPBB);
461 for (auto *Succ : to_vector(VPBB->successors())) {
463 VPBlockUtils::connectBlocks(PredVPBB, Succ);
464 }
465 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
466 }
467 return !WorkList.empty();
468}
469
471 // Convert masked VPReplicateRecipes to if-then region blocks.
473
474 bool ShouldSimplify = true;
475 while (ShouldSimplify) {
476 ShouldSimplify = sinkScalarOperands(Plan);
477 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
478 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
479 }
480}
481
482/// Remove redundant casts of inductions.
483///
484/// Such redundant casts are casts of induction variables that can be ignored,
485/// because we already proved that the casted phi is equal to the uncasted phi
486/// in the vectorized loop. There is no need to vectorize the cast - the same
487/// value can be used for both the phi and casts in the vector loop.
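/// For example (an illustrative sketch): if the induction descriptor of %iv
/// records a cast chain such as "%c = sext i32 %iv to i64" that was already
/// proven to produce the same values as the vectorized IV, users of %c are
/// redirected to the IV below and the now-dead cast recipes are left for
/// later cleanup.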
488static void removeRedundantInductionCasts(VPlan &Plan) {
489 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
490 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
491 if (!IV || IV->getTruncInst())
492 continue;
493
494 // A sequence of IR Casts has potentially been recorded for IV, which
495 // *must be bypassed* when the IV is vectorized, because the vectorized IV
496 // will produce the desired casted value. This sequence forms a def-use
497 // chain and is provided in reverse order, ending with the cast that uses
498 // the IV phi. Search for the recipe of the last cast in the chain and
499 // replace it with the original IV. Note that only the final cast is
500 // expected to have users outside the cast-chain and the dead casts left
501 // over will be cleaned up later.
502 auto &Casts = IV->getInductionDescriptor().getCastInsts();
503 VPValue *FindMyCast = IV;
504 for (Instruction *IRCast : reverse(Casts)) {
505 VPSingleDefRecipe *FoundUserCast = nullptr;
506 for (auto *U : FindMyCast->users()) {
507 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
508 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
509 FoundUserCast = UserCast;
510 break;
511 }
512 }
513 FindMyCast = FoundUserCast;
514 }
515 FindMyCast->replaceAllUsesWith(IV);
516 }
517}
518
519/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
520/// recipe, if it exists.
522 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
523 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
524 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
525 for (VPUser *U : CanonicalIV->users()) {
527 if (WidenNewIV)
528 break;
529 }
530
531 if (!WidenNewIV)
532 return;
533
534 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
535 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
536 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
537
538 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
539 continue;
540
541 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
542 // everything WidenNewIV's users need. That is, WidenOriginalIV will
543 // generate a vector phi or all users of WidenNewIV demand the first lane
544 // only.
545 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
546 vputils::onlyFirstLaneUsed(WidenNewIV)) {
547 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
548 WidenNewIV->eraseFromParent();
549 return;
550 }
551 }
552}
553
554/// Returns true if \p R is dead and can be removed.
555static bool isDeadRecipe(VPRecipeBase &R) {
556 // Do remove conditional assume instructions as their conditions may be
557 // flattened.
558 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
559 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
561 if (IsConditionalAssume)
562 return true;
563
564 if (R.mayHaveSideEffects())
565 return false;
566
567 // Recipe is dead if no user keeps the recipe alive.
568 return all_of(R.definedValues(),
569 [](VPValue *V) { return V->getNumUsers() == 0; });
570}
571
574 vp_post_order_deep(Plan.getEntry()))) {
575 // The recipes in the block are processed in reverse order, to catch chains
576 // of dead recipes.
577 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
578 if (isDeadRecipe(R)) {
579 R.eraseFromParent();
580 continue;
581 }
582
583 // Check if R is a dead VPPhi <-> update cycle and remove it.
584 auto *PhiR = dyn_cast<VPPhi>(&R);
585 if (!PhiR || PhiR->getNumOperands() != 2 || PhiR->getNumUsers() != 1)
586 continue;
587 VPValue *Incoming = PhiR->getOperand(1);
588 if (*PhiR->user_begin() != Incoming->getDefiningRecipe() ||
589 Incoming->getNumUsers() != 1)
590 continue;
591 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
592 PhiR->eraseFromParent();
593 Incoming->getDefiningRecipe()->eraseFromParent();
594 }
595 }
596}
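// Illustrative example (assumed input) of the dead VPPhi <-> update cycle
// removed above: a counter that only feeds its own increment,
//
//   %phi = phi [ 0, %preheader ], [ %inc, %loop ]
//   %inc = add %phi, 1     ; sole user of %phi, and %phi is its sole user
//
// is erased entirely once no other recipe uses either value.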
597
600 Instruction::BinaryOps InductionOpcode,
601 FPMathOperator *FPBinOp, Instruction *TruncI,
602 VPValue *StartV, VPValue *Step, DebugLoc DL,
603 VPBuilder &Builder) {
604 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
605 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
606 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
607 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
608 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
609
610 // Truncate base induction if needed.
611 VPTypeAnalysis TypeInfo(Plan);
612 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
613 if (TruncI) {
614 Type *TruncTy = TruncI->getType();
615 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
616 "Not truncating.");
617 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
618 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
619 ResultTy = TruncTy;
620 }
621
622 // Truncate step if needed.
623 Type *StepTy = TypeInfo.inferScalarType(Step);
624 if (ResultTy != StepTy) {
625 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
626 "Not truncating.");
627 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
628 auto *VecPreheader =
630 VPBuilder::InsertPointGuard Guard(Builder);
631 Builder.setInsertPoint(VecPreheader);
632 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
633 }
634 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
635 &Plan.getVF(), DL);
636}
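// Roughly, in VPlan's textual dump form (shown only as an illustration; the
// exact names are assumed), the recipes created above look like:
//
//   vp<%offset.idx> = DERIVED-IV ir<%start> + vp<%canonical-iv> * ir<%step>
//   vp<%steps>      = SCALAR-STEPS vp<%offset.idx>, ir<%step>, vp<%vf>
//
// with optional scalar truncates of the base IV and of the step inserted
// when a narrower result type is required.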
637
640 for (unsigned I = 0; I != Users.size(); ++I) {
642 if (isa<VPHeaderPHIRecipe>(Cur))
643 continue;
644 for (VPValue *V : Cur->definedValues())
645 Users.insert_range(V->users());
646 }
647 return Users.takeVector();
648}
649
650/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
651/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
652/// VPWidenPointerInductionRecipe will generate vectors only. If some users
653/// require vectors while other require scalars, the scalar uses need to extract
654/// the scalars from the generated vectors (Note that this is different to how
655/// int/fp inductions are handled). Legalize extract-from-ends using uniform
656/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
657/// the correct end value is available. Also optimize
658/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
659/// providing them scalar steps built on the canonical scalar IV and update the
660/// original IV's users. This is an optional optimization to reduce the need for
661/// vector extracts.
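/// For example (illustrative): a VPWidenPointerInductionRecipe whose users
/// only need scalar addresses is rewritten below roughly as
///   %steps    = ScalarIVSteps(0, %stride)
///   %next.gep = ptradd %start, %steps
/// so no wide vector of pointers has to be materialized.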
662static void legalizeAndOptimizeInductions(VPlan &Plan) {
663 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
664 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
665 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
666 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
667 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
668 if (!PhiR)
669 continue;
670
671 // Try to narrow wide and replicating recipes to uniform recipes, based on
672 // VPlan analysis.
673 // TODO: Apply to all recipes in the future, to replace legacy uniformity
674 // analysis.
675 auto Users = collectUsersRecursively(PhiR);
676 for (VPUser *U : reverse(Users)) {
677 auto *Def = dyn_cast<VPSingleDefRecipe>(U);
678 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
679 // Skip recipes that shouldn't be narrowed.
680 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
681 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
682 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
683 continue;
684
685 // Skip recipes whose lanes other than the first may be used.
687 continue;
688
689 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
690 Def->operands(), /*IsUniform*/ true);
691 Clone->insertAfter(Def);
692 Def->replaceAllUsesWith(Clone);
693 }
694
695 // Replace wide pointer inductions that have only their scalars used with
696 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
697 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
698 if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
699 continue;
700
701 const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
702 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
703 VPValue *StepV = PtrIV->getOperand(1);
705 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
706 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
707
708 VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
709 PtrIV->getDebugLoc(), "next.gep");
710
711 PtrIV->replaceAllUsesWith(PtrAdd);
712 continue;
713 }
714
715 // Replace widened induction with scalar steps for users that only use
716 // scalars.
717 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
718 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
719 return U->usesScalars(WideIV);
720 }))
721 continue;
722
723 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
725 Plan, ID.getKind(), ID.getInductionOpcode(),
726 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
727 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
728 WideIV->getDebugLoc(), Builder);
729
730 // Update scalar users of IV to use Step instead.
731 if (!HasOnlyVectorVFs)
732 WideIV->replaceAllUsesWith(Steps);
733 else
734 WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
735 return U.usesScalars(WideIV);
736 });
737 }
738}
739
740/// Check if \p VPV is an untruncated wide induction, either before or after the
741/// increment. If so return the header IV (before the increment), otherwise
742/// return null.
744 ScalarEvolution &SE) {
745 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
746 if (WideIV) {
747 // VPV itself is a wide induction, separately compute the end value for exit
748 // users if it is not a truncated IV.
749 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
750 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
751 }
752
753 // Check if VPV is an optimizable induction increment.
754 VPRecipeBase *Def = VPV->getDefiningRecipe();
755 if (!Def || Def->getNumOperands() != 2)
756 return nullptr;
757 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
758 if (!WideIV)
759 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
760 if (!WideIV)
761 return nullptr;
762
763 auto IsWideIVInc = [&]() {
764 auto &ID = WideIV->getInductionDescriptor();
765
766 // Check if VPV increments the induction by the induction step.
767 VPValue *IVStep = WideIV->getStepValue();
768 switch (ID.getInductionOpcode()) {
769 case Instruction::Add:
770 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
771 case Instruction::FAdd:
773 m_Specific(IVStep)));
774 case Instruction::FSub:
775 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
776 m_Specific(IVStep)));
777 case Instruction::Sub: {
778 // IVStep will be the negated step of the subtraction. Check if Step == -1
779 // * IVStep.
780 VPValue *Step;
781 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
782 return false;
783 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
784 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
785 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
786 !isa<SCEVCouldNotCompute>(StepSCEV) &&
787 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
788 }
789 default:
790 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
791 match(VPV, m_GetElementPtr(m_Specific(WideIV),
792 m_Specific(WideIV->getStepValue())));
793 }
794 llvm_unreachable("should have been covered by switch above");
795 };
796 return IsWideIVInc() ? WideIV : nullptr;
797}
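// For example (illustrative): given a widened induction "%iv" with step
// "%step", both "%iv" itself and its increment "%iv.next = add %iv, %step"
// are recognized above, and in both cases the header phi "%iv" is returned.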
798
799/// Attempts to optimize the induction variable exit values for users in the
800/// early exit block.
801static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
802 VPTypeAnalysis &TypeInfo,
803 VPBlockBase *PredVPBB,
804 VPValue *Op,
805 ScalarEvolution &SE) {
806 VPValue *Incoming, *Mask;
809 return nullptr;
810
811 auto *WideIV = getOptimizableIVOf(Incoming, SE);
812 if (!WideIV)
813 return nullptr;
814
815 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
816 if (WideIntOrFp && WideIntOrFp->getTruncInst())
817 return nullptr;
818
819 // Calculate the final index.
820 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
821 auto *CanonicalIV = LoopRegion->getCanonicalIV();
822 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
823 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
824
825 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
826 VPValue *FirstActiveLane =
827 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
828 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
829 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
830 FirstActiveLaneType, DL);
831 VPValue *EndValue =
832 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
833
834 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
835 // changed it means the exit is using the incremented value, so we need to
836 // add the step.
837 if (Incoming != WideIV) {
838 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
839 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
840 }
841
842 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
843 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
844 VPValue *Start = WideIV->getStartValue();
845 VPValue *Step = WideIV->getStepValue();
846 EndValue = B.createDerivedIV(
847 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
848 Start, EndValue, Step);
849 }
850
851 return EndValue;
852}
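// Worked sketch (illustrative, assumed values): if the early-exit mask first
// becomes active in lane 2 of the iteration where the canonical IV is 8, the
// code above computes EndValue = 8 + 2 = 10, adds 1 when the exit uses the
// incremented IV, and for a non-canonical induction maps the index through a
// derived IV (e.g. start 100, step 4 gives 100 + 10 * 4 = 140).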
853
854/// Attempts to optimize the induction variable exit values for users in the
855/// exit block coming from the latch in the original scalar loop.
857 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
861 return nullptr;
862
863 auto *WideIV = getOptimizableIVOf(Incoming, SE);
864 if (!WideIV)
865 return nullptr;
866
867 VPValue *EndValue = EndValues.lookup(WideIV);
868 assert(EndValue && "end value must have been pre-computed");
869
870 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
871 // changed it means the exit is using the incremented value, so we don't
872 // need to subtract the step.
873 if (Incoming != WideIV)
874 return EndValue;
875
876 // Otherwise, subtract the step from the EndValue.
877 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
878 VPValue *Step = WideIV->getStepValue();
879 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
880 if (ScalarTy->isIntegerTy())
881 return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
882 if (ScalarTy->isPointerTy()) {
883 Type *StepTy = TypeInfo.inferScalarType(Step);
884 auto *Zero = Plan.getConstantInt(StepTy, 0);
885 return B.createPtrAdd(EndValue,
886 B.createNaryOp(Instruction::Sub, {Zero, Step}),
887 DebugLoc::getUnknown(), "ind.escape");
888 }
889 if (ScalarTy->isFloatingPointTy()) {
890 const auto &ID = WideIV->getInductionDescriptor();
891 return B.createNaryOp(
892 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
893 ? Instruction::FSub
894 : Instruction::FAdd,
895 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
896 }
897 llvm_unreachable("all possible induction types must be handled");
898 return nullptr;
899}
900
902 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
903 ScalarEvolution &SE) {
904 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
905 VPTypeAnalysis TypeInfo(Plan);
906 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
907 for (VPRecipeBase &R : ExitVPBB->phis()) {
908 auto *ExitIRI = cast<VPIRPhi>(&R);
909
910 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
911 VPValue *Escape = nullptr;
912 if (PredVPBB == MiddleVPBB)
913 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
914 ExitIRI->getOperand(Idx),
915 EndValues, SE);
916 else
917 Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
918 ExitIRI->getOperand(Idx), SE);
919 if (Escape)
920 ExitIRI->setOperand(Idx, Escape);
921 }
922 }
923 }
924}
925
926/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
927/// them with already existing recipes expanding the same SCEV expression.
930
931 for (VPRecipeBase &R :
933 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
934 if (!ExpR)
935 continue;
936
937 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
938 if (Inserted)
939 continue;
940 ExpR->replaceAllUsesWith(V->second);
941 ExpR->eraseFromParent();
942 }
943}
944
946 SmallVector<VPValue *> WorkList;
948 WorkList.push_back(V);
949
950 while (!WorkList.empty()) {
951 VPValue *Cur = WorkList.pop_back_val();
952 if (!Seen.insert(Cur).second)
953 continue;
955 if (!R)
956 continue;
957 if (!isDeadRecipe(*R))
958 continue;
959 append_range(WorkList, R->operands());
960 R->eraseFromParent();
961 }
962}
963
964/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
965/// Returns an optional pair, where the first element indicates whether it is
966/// an intrinsic ID.
967static std::optional<std::pair<bool, unsigned>>
969 return TypeSwitch<const VPSingleDefRecipe *,
970 std::optional<std::pair<bool, unsigned>>>(R)
973 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
974 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
975 return std::make_pair(true, I->getVectorIntrinsicID());
976 })
977 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
978 // For recipes that do not directly map to LLVM IR instructions,
979 // assign opcodes after the last VPInstruction opcode (which is also
980 // after the last IR Instruction opcode), based on the VPDefID.
981 return std::make_pair(false,
982 VPInstruction::OpsEnd + 1 + I->getVPDefID());
983 })
984 .Default([](auto *) { return std::nullopt; });
985}
986
987/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
988/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
989/// Operands are foldable live-ins.
991 ArrayRef<VPValue *> Operands,
992 const DataLayout &DL,
993 VPTypeAnalysis &TypeInfo) {
994 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
995 if (!OpcodeOrIID)
996 return nullptr;
997
999 for (VPValue *Op : Operands) {
1000 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1001 return nullptr;
1002 Ops.push_back(Op->getLiveInIRValue());
1003 }
1004
1005 auto FoldToIRValue = [&]() -> Value * {
1006 InstSimplifyFolder Folder(DL);
1007 if (OpcodeOrIID->first) {
1008 if (R.getNumOperands() != 2)
1009 return nullptr;
1010 unsigned ID = OpcodeOrIID->second;
1011 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1012 TypeInfo.inferScalarType(&R));
1013 }
1014 unsigned Opcode = OpcodeOrIID->second;
1015 if (Instruction::isBinaryOp(Opcode))
1016 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1017 Ops[0], Ops[1]);
1018 if (Instruction::isCast(Opcode))
1019 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1020 TypeInfo.inferScalarType(R.getVPSingleValue()));
1021 switch (Opcode) {
1023 return Folder.FoldSelect(Ops[0], Ops[1],
1025 case VPInstruction::Not:
1026 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1028 case Instruction::Select:
1029 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1030 case Instruction::ICmp:
1031 case Instruction::FCmp:
1032 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1033 Ops[1]);
1034 case Instruction::GetElementPtr: {
1035 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1036 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1037 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1038 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1039 }
1042 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1043 Ops[0], Ops[1],
1044 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1045 // An extract of a live-in is an extract of a broadcast, so return the
1046 // broadcasted element.
1047 case Instruction::ExtractElement:
1048 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1049 return Ops[0];
1050 }
1051 return nullptr;
1052 };
1053
1054 if (Value *V = FoldToIRValue())
1055 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1056 return nullptr;
1057}
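// For example (illustrative): an add recipe whose operands are the live-in
// constants 3 and 4 folds to the live-in constant 7, and a compare of two
// live-in constants folds to a live-in true/false, letting later
// simplifications prune the surrounding recipes.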
1058
1059/// Try to simplify recipe \p R.
1060static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
1061 VPlan *Plan = R.getParent()->getPlan();
1062
1063 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1064 if (!Def)
1065 return;
1066
1067 // Simplification of live-in IR values for SingleDef recipes using
1068 // InstSimplifyFolder.
1069 const DataLayout &DL =
1071 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1072 return Def->replaceAllUsesWith(V);
1073
1074 // Fold PredPHI LiveIn -> LiveIn.
1075 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
1076 VPValue *Op = PredPHI->getOperand(0);
1077 if (Op->isLiveIn())
1078 PredPHI->replaceAllUsesWith(Op);
1079 }
1080
1081 VPBuilder Builder(Def);
1082 VPValue *A;
1083 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1084 Type *TruncTy = TypeInfo.inferScalarType(Def);
1085 Type *ATy = TypeInfo.inferScalarType(A);
1086 if (TruncTy == ATy) {
1087 Def->replaceAllUsesWith(A);
1088 } else {
1089 // Don't replace a scalarizing recipe with a widened cast.
1090 if (isa<VPReplicateRecipe>(Def))
1091 return;
1092 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1093
1094 unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
1095 ? Instruction::SExt
1096 : Instruction::ZExt;
1097 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1098 TruncTy);
1099 if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
1100 // UnderlyingExt has distinct return type, used to retain legacy cost.
1101 Ext->setUnderlyingValue(UnderlyingExt);
1102 }
1103 Def->replaceAllUsesWith(Ext);
1104 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1105 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1106 Def->replaceAllUsesWith(Trunc);
1107 }
1108 }
1109#ifndef NDEBUG
1110 // Verify that the cached type info for both A and its users is still
1111 // accurate by comparing it to freshly computed types.
1112 VPTypeAnalysis TypeInfo2(*Plan);
1113 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1114 for (VPUser *U : A->users()) {
1115 auto *R = cast<VPRecipeBase>(U);
1116 for (VPValue *VPV : R->definedValues())
1117 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1118 }
1119#endif
1120 }
1121
1122 // Simplify (X && Y) || (X && !Y) -> X.
1123 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1124 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1125 // recipes to be visited during simplification.
1126 VPValue *X, *Y, *Z;
1127 if (match(Def,
1130 Def->replaceAllUsesWith(X);
1131 Def->eraseFromParent();
1132 return;
1133 }
1134
1135 // x | 1 -> 1
1136 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1137 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1138
1139 // x | 0 -> x
1140 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1141 return Def->replaceAllUsesWith(X);
1142
1143 // x & 0 -> 0
1144 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1145 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1146
1147 // x && false -> false
1148 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1149 return Def->replaceAllUsesWith(Def->getOperand(1));
1150
1151 // (x && y) || (x && z) -> x && (y || z)
1154 // Simplify only if one of the operands has one use to avoid creating an
1155 // extra recipe.
1156 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1157 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1158 return Def->replaceAllUsesWith(
1159 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1160
1161 // x && !x -> 0
1163 return Def->replaceAllUsesWith(Plan->getFalse());
1164
1165 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1166 return Def->replaceAllUsesWith(X);
1167
1168 // select !c, x, y -> select c, y, x
1169 VPValue *C;
1170 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1171 Def->setOperand(0, C);
1172 Def->setOperand(1, Y);
1173 Def->setOperand(2, X);
1174 return;
1175 }
1176
1177 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1178 // tail folding it is likely that x is a header mask and can be simplified
1179 // further.
1181 m_VPValue(Z))) &&
1182 X->hasMoreThanOneUniqueUser())
1183 return Def->replaceAllUsesWith(
1184 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1185
1186 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1187 return Def->replaceAllUsesWith(A);
1188
1189 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1190 return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
1191 : R.getOperand(0));
1192
1193 if (match(Def, m_Not(m_VPValue(A)))) {
1194 if (match(A, m_Not(m_VPValue(A))))
1195 return Def->replaceAllUsesWith(A);
1196
1197 // Try to fold Not into compares by adjusting the predicate in-place.
1198 CmpPredicate Pred;
1199 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1200 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1201 if (all_of(Cmp->users(),
1203 m_Not(m_Specific(Cmp)),
1204 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1205 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1206 for (VPUser *U : to_vector(Cmp->users())) {
1207 auto *R = cast<VPSingleDefRecipe>(U);
1208 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1209 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1210 R->setOperand(1, Y);
1211 R->setOperand(2, X);
1212 } else {
1213 // not (cmp pred) -> cmp inv_pred
1214 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1215 R->replaceAllUsesWith(Cmp);
1216 }
1217 }
1218 // If Cmp doesn't have a debug location, use the one from the negation,
1219 // to preserve the location.
1220 if (!Cmp->getDebugLoc() && R.getDebugLoc())
1221 Cmp->setDebugLoc(R.getDebugLoc());
1222 }
1223 }
1224 }
1225
1226 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1227 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1228 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1229 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1230 TypeInfo.inferScalarType(Def))
1231 return Def->replaceAllUsesWith(Def->getOperand(1));
1232
1234 m_One()))) {
1235 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1236 if (TypeInfo.inferScalarType(X) != WideStepTy)
1237 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1238 Def->replaceAllUsesWith(X);
1239 return;
1240 }
1241
1242 // For i1 vp.merges produced by AnyOf reductions:
1243 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1245 m_VPValue(X), m_VPValue())) &&
1247 TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
1248 Def->setOperand(1, Def->getOperand(0));
1249 Def->setOperand(0, Y);
1250 return;
1251 }
1252
1253 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1254 if (Phi->getOperand(0) == Phi->getOperand(1))
1255 Def->replaceAllUsesWith(Phi->getOperand(0));
1256 return;
1257 }
1258
1259 // Look through ExtractLastElement (BuildVector ....).
1262 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1263 Def->replaceAllUsesWith(
1264 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1265 return;
1266 }
1267
1268 // Look through ExtractPenultimateElement (BuildVector ....).
1270 m_BuildVector()))) {
1271 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1272 Def->replaceAllUsesWith(
1273 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1274 return;
1275 }
1276
1277 uint64_t Idx;
1279 auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1280 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1281 return;
1282 }
1283
1284 if (match(Def, m_BuildVector()) && all_equal(R.operands())) {
1285 Def->replaceAllUsesWith(
1286 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1287 return;
1288 }
1289
1290 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1291 if (Phi->getNumOperands() == 1)
1292 Phi->replaceAllUsesWith(Phi->getOperand(0));
1293 return;
1294 }
1295
1296 // Some simplifications can only be applied after unrolling. Perform them
1297 // below.
1298 if (!Plan->isUnrolled())
1299 return;
1300
1301 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1302 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1303 isa<VPPhi>(X)) {
1304 auto *Phi = cast<VPPhi>(X);
1305 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1306 Phi->getNumUsers() == 1 && (*Phi->user_begin() == &R)) {
1307 Phi->setOperand(0, Y);
1308 Def->replaceAllUsesWith(Phi);
1309 return;
1310 }
1311 }
1312
1313 // VPVectorPointer for part 0 can be replaced by its start pointer.
1314 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
1315 if (VecPtr->isFirstPart()) {
1316 VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
1317 return;
1318 }
1319 }
1320
1321 // VPScalarIVSteps for part 0 can be replaced by its start value, if only
1322 // the first lane is demanded.
1323 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1324 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1325 Steps->replaceAllUsesWith(Steps->getOperand(0));
1326 return;
1327 }
1328 }
1329 // Simplify redundant ReductionStartVector recipes after unrolling.
1330 VPValue *StartV;
1332 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1333 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1334 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1335 return PhiR && PhiR->isInLoop();
1336 });
1337 return;
1338 }
1339
1340 if (match(Def,
1343 Def->replaceAllUsesWith(A);
1344 return;
1345 }
1346
1351 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1352 all_of(A->users(),
1353 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1354 return Def->replaceAllUsesWith(A);
1355 }
1356
1357 if (Plan->getUF() == 1 &&
1359 return Def->replaceAllUsesWith(
1360 Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
1361 }
1362}
1363
1366 Plan.getEntry());
1367 VPTypeAnalysis TypeInfo(Plan);
1369 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1370 simplifyRecipe(R, TypeInfo);
1371 }
1372 }
1373}
1374
1376 if (Plan.hasScalarVFOnly())
1377 return;
1378
1379 // Try to narrow wide and replicating recipes to single scalar recipes,
1380 // based on VPlan analysis. Only process blocks in the loop region for now,
1381 // without traversing into nested regions, as recipes in replicate regions
1382 // cannot be converted yet.
1385 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1387 continue;
1388 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1389 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1390 continue;
1391
1392 auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
1393 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1394 vputils::isSingleScalar(RepR->getOperand(1))) {
1395 auto *Clone = new VPReplicateRecipe(
1396 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1397 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
1398 Clone->insertBefore(RepOrWidenR);
1399 unsigned ExtractOpc =
1400 vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
1403 auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
1404 Ext->insertBefore(Clone);
1405 Clone->setOperand(0, Ext);
1406 RepR->eraseFromParent();
1407 continue;
1408 }
1409
1410 // Skip recipes that aren't single scalars or don't have only their
1411 // scalar results used. In the latter case, we would introduce extra
1412 // broadcasts.
1413 if (!vputils::isSingleScalar(RepOrWidenR) ||
1414 !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
1415 return U->usesScalars(RepOrWidenR) ||
1416 match(cast<VPRecipeBase>(U),
1417 m_CombineOr(m_ExtractLastElement(m_VPValue()),
1418 m_ExtractLastLanePerPart(m_VPValue())));
1419 }))
1420 continue;
1421
1422 auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
1423 RepOrWidenR->operands(),
1424 true /*IsSingleScalar*/);
1425 Clone->insertBefore(RepOrWidenR);
1426 RepOrWidenR->replaceAllUsesWith(Clone);
1427 if (isDeadRecipe(*RepOrWidenR))
1428 RepOrWidenR->eraseFromParent();
1429 }
1430 }
1431}
1432
1433/// Try to see if all of \p Blend's masks share a common value that is
1434/// logically and'ed in, and if so remove it from the masks.
1436 if (Blend->isNormalized())
1437 return;
1438 VPValue *CommonEdgeMask;
1439 if (!match(Blend->getMask(0),
1440 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1441 return;
1442 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1443 if (!match(Blend->getMask(I),
1444 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1445 return;
1446 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1447 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1448}
1449
1450/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1451/// to make sure the masks are simplified.
1452static void simplifyBlends(VPlan &Plan) {
1455 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1456 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1457 if (!Blend)
1458 continue;
1459
1460 removeCommonBlendMask(Blend);
1461
1462 // Try to remove redundant blend recipes.
1463 SmallPtrSet<VPValue *, 4> UniqueValues;
1464 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1465 UniqueValues.insert(Blend->getIncomingValue(0));
1466 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1467 if (!match(Blend->getMask(I), m_False()))
1468 UniqueValues.insert(Blend->getIncomingValue(I));
1469
1470 if (UniqueValues.size() == 1) {
1471 Blend->replaceAllUsesWith(*UniqueValues.begin());
1472 Blend->eraseFromParent();
1473 continue;
1474 }
1475
1476 if (Blend->isNormalized())
1477 continue;
1478
1479 // Normalize the blend so its first incoming value is used as the initial
1480 // value with the others blended into it.
1481
1482 unsigned StartIndex = 0;
1483 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1484 // If a value's mask is used only by the blend then it can be removed as dead code.
1485 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1486 // that's used by multiple blends where it can be removed from them all.
1487 VPValue *Mask = Blend->getMask(I);
1488 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1489 StartIndex = I;
1490 break;
1491 }
1492 }
1493
1494 SmallVector<VPValue *, 4> OperandsWithMask;
1495 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1496
1497 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1498 if (I == StartIndex)
1499 continue;
1500 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1501 OperandsWithMask.push_back(Blend->getMask(I));
1502 }
1503
1504 auto *NewBlend =
1505 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1506 OperandsWithMask, Blend->getDebugLoc());
1507 NewBlend->insertBefore(&R);
1508
1509 VPValue *DeadMask = Blend->getMask(StartIndex);
1510 Blend->replaceAllUsesWith(NewBlend);
1511 Blend->eraseFromParent();
1513
1514 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1515 VPValue *NewMask;
1516 if (NewBlend->getNumOperands() == 3 &&
1517 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1518 VPValue *Inc0 = NewBlend->getOperand(0);
1519 VPValue *Inc1 = NewBlend->getOperand(1);
1520 VPValue *OldMask = NewBlend->getOperand(2);
1521 NewBlend->setOperand(0, Inc1);
1522 NewBlend->setOperand(1, Inc0);
1523 NewBlend->setOperand(2, NewMask);
1524 if (OldMask->getNumUsers() == 0)
1525 cast<VPInstruction>(OldMask)->eraseFromParent();
1526 }
1527 }
1528 }
1529}
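// Illustrative example (assumed input): if every mask of a blend has the form
// "%common && %edge.i", removeCommonBlendMask strips %common from each mask;
// if all incoming values whose masks are not known-false then turn out to be
// the same VPValue, the whole blend is replaced by that single value.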
1530
1531/// Optimize the width of vector induction variables in \p Plan based on a known
1532/// constant Trip Count, \p BestVF and \p BestUF.
1534 ElementCount BestVF,
1535 unsigned BestUF) {
1536 // Only proceed if we have not completely removed the vector region.
1537 if (!Plan.getVectorLoopRegion())
1538 return false;
1539
1540 const APInt *TC;
1541 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1542 return false;
1543
1544 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1545 // and UF. Returns at least 8.
1546 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1547 APInt AlignedTC =
1550 APInt MaxVal = AlignedTC - 1;
1551 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1552 };
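// Worked example (illustrative, assuming the elided lines round TC up to a
// multiple of Align): with a known trip count of 1000, VF = 8 and UF = 2,
// Align is 16, AlignedTC is 1008, MaxVal is 1007 (10 active bits), and
// PowerOf2Ceil(10) = 16, so the induction can be narrowed to i16.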
1553 unsigned NewBitWidth =
1554 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1555
1556 LLVMContext &Ctx = Plan.getContext();
1557 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1558
1559 bool MadeChange = false;
1560
1561 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1562 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1563 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1564
1565 // Currently only handle canonical IVs as it is trivial to replace the start
1566 // and stop values, and we currently only perform the optimization when the
1567 // IV has a single use.
1568 if (!WideIV || !WideIV->isCanonical() ||
1569 WideIV->hasMoreThanOneUniqueUser() ||
1570 NewIVTy == WideIV->getScalarType())
1571 continue;
1572
1573 // Currently only handle cases where the single user is a header-mask
1574 // comparison with the backedge-taken-count.
1575 if (!match(*WideIV->user_begin(),
1576 m_ICmp(m_Specific(WideIV),
1579 continue;
1580
1581 // Update IV operands and comparison bound to use new narrower type.
1582 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1583 WideIV->setStartValue(NewStart);
1584 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1585 WideIV->setStepValue(NewStep);
1586
1587 auto *NewBTC = new VPWidenCastRecipe(
1588 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1589 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1590 auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1591 Cmp->setOperand(1, NewBTC);
1592
1593 MadeChange = true;
1594 }
1595
1596 return MadeChange;
1597}
1598
1599/// Return true if \p Cond is known to be true for given \p BestVF and \p
1600/// BestUF.
1602 ElementCount BestVF, unsigned BestUF,
1603 ScalarEvolution &SE) {
1605 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1606 &SE](VPValue *C) {
1607 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1608 });
1609
1610 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1612 m_Specific(CanIV->getBackedgeValue()),
1613 m_Specific(&Plan.getVectorTripCount()))))
1614 return false;
1615
1616 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1617 // count is not conveniently available as SCEV so far, so we compare directly
1618 // against the original trip count. This is stricter than necessary, as we
1619 // will only return true if the trip count == vector trip count.
1620 const SCEV *VectorTripCount =
1622 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1623 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1624 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1625 "Trip count SCEV must be computable");
1626 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1627 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1628 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1629}
1630
1631/// Try to replace multiple active lane masks used for control flow with
1632/// a single, wide active lane mask instruction followed by multiple
1633/// extract subvector intrinsics. This applies to the active lane mask
1634/// instructions both in the loop and in the preheader.
1635/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1636/// new extracts from the first active lane mask, which has its last
1637/// operand (multiplier) set to UF.
1638static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1639 unsigned UF) {
1640 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1641 return false;
1642
1643 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1644 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1645 auto *Term = &ExitingVPBB->back();
1646
1647 using namespace llvm::VPlanPatternMatch;
1648 if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1649 m_VPValue(), m_VPValue(), m_VPValue())))))
1650 return false;
1651
1652 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1653 LLVMContext &Ctx = Plan.getContext();
1654
1655 auto ExtractFromALM = [&](VPInstruction *ALM,
1656 SmallVectorImpl<VPValue *> &Extracts) {
1657 DebugLoc DL = ALM->getDebugLoc();
1658 for (unsigned Part = 0; Part < UF; ++Part) {
1659 SmallVector<VPValue *> Ops;
1660 Ops.append({ALM, Plan.getOrAddLiveIn(
1661 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1662 VF.getKnownMinValue() * Part))});
1663 auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1664 IntegerType::getInt1Ty(Ctx), DL);
1665 Extracts[Part] = Ext;
1666 Ext->insertAfter(ALM);
1667 }
1668 };
1669
1670 // Create a list of each active lane mask phi, ordered by unroll part.
1671 SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
1672 for (VPRecipeBase &R : Header->phis()) {
1673 auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1674 if (!Phi)
1675 continue;
1676 VPValue *Index = nullptr;
1677 match(Phi->getBackedgeValue(),
1678 m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
1679 assert(Index && "Expected index from ActiveLaneMask instruction");
1680
1681 uint64_t Part;
1682 if (match(Index,
1683 m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
1684 m_VPValue(), m_ConstantInt(Part))))
1685 Phis[Part] = Phi;
1686 else
1687 // Anything other than a CanonicalIVIncrementForPart is part 0
1688 Phis[0] = Phi;
1689 }
1690
1691 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1692 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1693
1694 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1695 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1696
1697 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1698 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1699 "Expected incoming values of Phi to be ActiveLaneMasks");
1700
1701 // When using wide lane masks, the return type of the get.active.lane.mask
1702 // intrinsic is VF x UF (last operand).
1703 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1704 EntryALM->setOperand(2, ALMMultiplier);
1705 LoopALM->setOperand(2, ALMMultiplier);
1706
1707 // Create UF x extract vectors and insert into preheader.
1708 SmallVector<VPValue *> EntryExtracts(UF);
1709 ExtractFromALM(EntryALM, EntryExtracts);
1710
1711 // Create UF x extract vectors and insert before the loop compare & branch,
1712 // updating the compare to use the first extract.
1713 SmallVector<VPValue *> LoopExtracts(UF);
1714 ExtractFromALM(LoopALM, LoopExtracts);
1715 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1716 Not->setOperand(0, LoopExtracts[0]);
1717
1718 // Update the incoming values of active lane mask phis.
1719 for (unsigned Part = 0; Part < UF; ++Part) {
1720 Phis[Part]->setStartValue(EntryExtracts[Part]);
1721 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1722 }
1723
1724 return true;
1725}
1726
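// Illustrative aside: with a single wide lane mask of VF * UF elements, the
// per-part masks used above are taken at fixed lane offsets, mirroring the
// llvm.vector.extract calls created in ExtractFromALM. A minimal standalone
// sketch (hypothetical helper, assuming a fixed-width VF):
#include <cstdint>
#include <vector>

static std::vector<uint64_t> wideMaskExtractOffsetsSketch(uint64_t VF,
                                                          unsigned UF) {
  std::vector<uint64_t> Offsets;
  for (unsigned Part = 0; Part < UF; ++Part)
    Offsets.push_back(VF * Part); // start lane of part Part's sub-mask
  return Offsets;
}

// E.g. VF = 4, UF = 4 yields offsets {0, 4, 8, 12}, one extract per unroll part.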
1727/// Try to simplify the branch condition of \p Plan. This may restrict the
1728/// resulting plan to \p BestVF and \p BestUF.
1729static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1730 unsigned BestUF,
1731 PredicatedScalarEvolution &PSE) {
1732 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1733 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1734 auto *Term = &ExitingVPBB->back();
1735 VPValue *Cond;
1736 ScalarEvolution &SE = *PSE.getSE();
1737 if (match(Term, m_BranchOnCount()) ||
1738 match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
1739 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1740 // Try to simplify the branch condition if TC <= VF * UF when the latch
1741 // terminator is BranchOnCount or BranchOnCond where the input is
1742 // Not(ActiveLaneMask).
1743 const SCEV *TripCount =
1744 vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1745 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1746 "Trip count SCEV must be computable");
1747 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1748 const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1749 if (TripCount->isZero() ||
1750 !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1751 return false;
1752 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1753 // For BranchOnCond, check if we can prove the condition to be true using VF
1754 // and UF.
1755 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1756 return false;
1757 } else {
1758 return false;
1759 }
1760
1761 // The vector loop region only executes once. If possible, completely remove
1762 // the region, otherwise replace the terminator controlling the latch with
1763 // (BranchOnCond true).
1764 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1765 // support for other non-canonical widen induction recipes (e.g.,
1766 // VPWidenPointerInductionRecipe).
1767 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1768 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1769 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1770 return R->isCanonical();
1771 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1772 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1773 })) {
1774 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1775 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1776 VPBuilder Builder(Plan.getVectorPreheader());
1777 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1778 R->getScalarType());
1779 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1780 HeaderR.eraseFromParent();
1781 continue;
1782 }
1783 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1784 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1785 HeaderR.eraseFromParent();
1786 }
1787
1788 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1789 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1790 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1791 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1792
1793 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1794 B->setParent(nullptr);
1795
1796 VPBlockUtils::connectBlocks(Preheader, Header);
1797 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1799 } else {
1800 // The vector region contains header phis for which we cannot remove the
1801 // loop region yet.
1802 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1803 Term->getDebugLoc());
1804 ExitingVPBB->appendRecipe(BOC);
1805 }
1806
1807 Term->eraseFromParent();
1808
1809 return true;
1810}
1811
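// Illustrative aside: a scalar sketch of the trip-count reasoning above for the
// BranchOnCount case. When the (non-zero) trip count is at most VF * UF, the
// tail-folded vector loop body runs exactly once, so its latch branch can be
// folded away. The helper name is hypothetical.
#include <cstdint>

static bool vectorLoopRunsOnceSketch(uint64_t TC, uint64_t VF, uint64_t UF) {
  return TC != 0 && TC <= VF * UF;
}

// E.g. TC = 7, VF = 4, UF = 2 gives 7 <= 8, so a single vector iteration covers
// the whole loop and the region can be simplified or removed.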
1812void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1813 unsigned BestUF,
1814 PredicatedScalarEvolution &PSE) {
1815 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1816 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1817
1818 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1819 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1820 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1821
1822 if (MadeChange) {
1823 Plan.setVF(BestVF);
1824 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1825 }
1826 // TODO: Further simplifications are possible
1827 // 1. Replace inductions with constants.
1828 // 2. Replace vector loop region with VPBasicBlock.
1829}
1830
1831/// Sink users of \p FOR after the recipe defining the previous value \p
1832/// Previous of the recurrence. \returns true if all users of \p FOR could be
1833/// re-arranged as needed or false if it is not possible.
1834static bool
1835sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1836 VPRecipeBase *Previous,
1837 VPDominatorTree &VPDT) {
1838 // Collect recipes that need sinking.
1839 SmallPtrSet<VPRecipeBase *, 8> Seen;
1840 SmallVector<VPRecipeBase *> WorkList;
1841 Seen.insert(Previous);
1842 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1843 // The previous value must not depend on the users of the recurrence phi. In
1844 // that case, FOR is not a fixed order recurrence.
1845 if (SinkCandidate == Previous)
1846 return false;
1847
1848 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1849 !Seen.insert(SinkCandidate).second ||
1850 VPDT.properlyDominates(Previous, SinkCandidate))
1851 return true;
1852
1853 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1854 return false;
1855
1856 WorkList.push_back(SinkCandidate);
1857 return true;
1858 };
1859
1860 // Recursively sink users of FOR after Previous.
1861 WorkList.push_back(FOR);
1862 for (unsigned I = 0; I != WorkList.size(); ++I) {
1863 VPRecipeBase *Current = WorkList[I];
1864 assert(Current->getNumDefinedValues() == 1 &&
1865 "only recipes with a single defined value expected");
1866
1867 for (VPUser *User : Current->getVPSingleValue()->users()) {
1868 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1869 return false;
1870 }
1871 }
1872
1873 // Keep recipes to sink ordered by dominance so earlier instructions are
1874 // processed first.
1875 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1876 return VPDT.properlyDominates(A, B);
1877 });
1878
1879 for (VPRecipeBase *SinkCandidate : WorkList) {
1880 if (SinkCandidate == FOR)
1881 continue;
1882
1883 SinkCandidate->moveAfter(Previous);
1884 Previous = SinkCandidate;
1885 }
1886 return true;
1887}
1888
1889/// Try to hoist \p Previous and its operands before all users of \p FOR.
1890static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1891 VPRecipeBase *Previous,
1892 VPDominatorTree &VPDT) {
1893 if (cannotHoistOrSinkRecipe(*Previous))
1894 return false;
1895
1896 // Collect recipes that need hoisting.
1897 SmallVector<VPRecipeBase *> HoistCandidates;
1898 SmallPtrSet<VPRecipeBase *, 8> Visited;
1899 VPRecipeBase *HoistPoint = nullptr;
1900 // Find the closest hoist point by looking at all users of FOR and selecting
1901 // the recipe dominating all other users.
1902 for (VPUser *U : FOR->users()) {
1903 auto *R = cast<VPRecipeBase>(U);
1904 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
1905 HoistPoint = R;
1906 }
1907 assert(all_of(FOR->users(),
1908 [&VPDT, HoistPoint](VPUser *U) {
1909 auto *R = cast<VPRecipeBase>(U);
1910 return HoistPoint == R ||
1911 VPDT.properlyDominates(HoistPoint, R);
1912 }) &&
1913 "HoistPoint must dominate all users of FOR");
1914
1915 auto NeedsHoisting = [HoistPoint, &VPDT,
1916 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1917 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1918 if (!HoistCandidate)
1919 return nullptr;
1920 VPRegionBlock *EnclosingLoopRegion =
1921 HoistCandidate->getParent()->getEnclosingLoopRegion();
1922 assert((!HoistCandidate->getRegion() ||
1923 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
1924 "CFG in VPlan should still be flat, without replicate regions");
1925 // Hoist candidate was already visited, no need to hoist.
1926 if (!Visited.insert(HoistCandidate).second)
1927 return nullptr;
1928
1929 // Candidate is outside loop region or a header phi, dominates FOR users w/o
1930 // hoisting.
1931 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
1932 return nullptr;
1933
1934 // If we reached a recipe that dominates HoistPoint, we don't need to
1935 // hoist the recipe.
1936 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
1937 return nullptr;
1938 return HoistCandidate;
1939 };
1940
1941 if (!NeedsHoisting(Previous->getVPSingleValue()))
1942 return true;
1943
1944 // Recursively try to hoist Previous and its operands before all users of FOR.
1945 HoistCandidates.push_back(Previous);
1946
1947 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1948 VPRecipeBase *Current = HoistCandidates[I];
1949 assert(Current->getNumDefinedValues() == 1 &&
1950 "only recipes with a single defined value expected");
1951 if (cannotHoistOrSinkRecipe(*Current))
1952 return false;
1953
1954 for (VPValue *Op : Current->operands()) {
1955 // If we reach FOR, it means the original Previous depends on some other
1956 // recurrence that in turn depends on FOR. If that is the case, we would
1957 // also need to hoist recipes involving the other FOR, which may break
1958 // dependencies.
1959 if (Op == FOR)
1960 return false;
1961
1962 if (auto *R = NeedsHoisting(Op))
1963 HoistCandidates.push_back(R);
1964 }
1965 }
1966
1967 // Order recipes to hoist by dominance so earlier instructions are processed
1968 // first.
1969 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1970 return VPDT.properlyDominates(A, B);
1971 });
1972
1973 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1974 HoistCandidate->moveBefore(*HoistPoint->getParent(),
1975 HoistPoint->getIterator());
1976 }
1977
1978 return true;
1979}
1980
1981bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1982 VPBuilder &LoopBuilder) {
1983 VPDominatorTree VPDT(Plan);
1984
1985 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1986 for (VPRecipeBase &R :
1987 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())
1988 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
1989 RecurrencePhis.push_back(FOR);
1990
1991 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1992 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1993 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1994 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1995 // to terminate.
1996 while (auto *PrevPhi =
1997 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
1998 assert(PrevPhi->getParent() == FOR->getParent());
1999 assert(SeenPhis.insert(PrevPhi).second);
2000 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2001 }
2002
2003 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2004 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2005 return false;
2006
2007 // Introduce a recipe to combine the incoming and previous values of a
2008 // fixed-order recurrence.
2009 VPBasicBlock *InsertBlock = Previous->getParent();
2010 if (isa<VPHeaderPHIRecipe>(Previous))
2011 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2012 else
2013 LoopBuilder.setInsertPoint(InsertBlock,
2014 std::next(Previous->getIterator()));
2015
2016 auto *RecurSplice =
2017 LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
2018 {FOR, FOR->getBackedgeValue()});
2019
2020 FOR->replaceAllUsesWith(RecurSplice);
2021 // Set the first operand of RecurSplice to FOR again, after replacing
2022 // all users.
2023 RecurSplice->setOperand(0, FOR);
2024 }
2025 return true;
2026}
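// Illustrative aside: a scalar model of the FirstOrderRecurrenceSplice recipe
// introduced above. It conceptually concatenates the previous iteration's
// vector with the current one and keeps the window starting at the last lane
// of the previous vector: lane 0 of the result is Prev[VF-1] and lane i (i > 0)
// is Cur[i-1]. A minimal sketch (hypothetical helper, non-empty vectors of
// equal size assumed):
#include <cstddef>
#include <vector>

static std::vector<int> recurrenceSpliceSketch(const std::vector<int> &Prev,
                                               const std::vector<int> &Cur) {
  std::vector<int> Result(Cur.size());
  for (std::size_t I = 0; I < Cur.size(); ++I)
    Result[I] = I == 0 ? Prev.back() : Cur[I - 1];
  return Result;
}

// E.g. Prev = {0,1,2,3}, Cur = {4,5,6,7} gives {3,4,5,6}: each lane holds the
// value of the previous scalar iteration, which is what users of the
// fixed-order recurrence expect.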
2027
2028void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
2029 for (VPRecipeBase &R :
2030 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2031 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2032 if (!PhiR)
2033 continue;
2034 RecurKind RK = PhiR->getRecurrenceKind();
2035 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2037 continue;
2038
2039 for (VPUser *U : collectUsersRecursively(PhiR))
2040 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2041 RecWithFlags->dropPoisonGeneratingFlags();
2042 }
2043 }
2044}
2045
2046namespace {
2047struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2048 static bool isSentinel(const VPSingleDefRecipe *Def) {
2049 return Def == getEmptyKey() || Def == getTombstoneKey();
2050 }
2051
2052 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2053 /// return that source element type.
2054 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2055 // All VPInstructions that lower to GEPs must have the i8 source element
2056 // type (as they are PtrAdds), so we omit it.
2057 return TypeSwitch<const VPSingleDefRecipe *, Type *>(R)
2058 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2059 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2060 return GEP->getSourceElementType();
2061 return nullptr;
2062 })
2063 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2064 [](auto *I) { return I->getSourceElementType(); })
2065 .Default([](auto *) { return nullptr; });
2066 }
2067
2068 /// Returns true if recipe \p Def can be safely handed for CSE.
2069 static bool canHandle(const VPSingleDefRecipe *Def) {
2070 // We can extend the list of handled recipes in the future,
2071 // provided we account for the data embedded in them while checking for
2072 // equality or hashing.
2073 auto C = getOpcodeOrIntrinsicID(Def);
2074
2075 // The issue with (Insert|Extract)Value is that the index of the
2076 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2077 // VPlan.
2078 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2079 C->second == Instruction::ExtractValue)))
2080 return false;
2081
2082 // During CSE, we can only handle recipes that don't read from memory: if
2083 // they read from memory, there could be an intervening write to memory
2084 // before the next instance is CSE'd, leading to an incorrect result.
2085 return !Def->mayReadFromMemory();
2086 }
2087
2088 /// Hash the underlying data of \p Def.
2089 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2090 const VPlan *Plan = Def->getParent()->getPlan();
2091 VPTypeAnalysis TypeInfo(*Plan);
2092 hash_code Result = hash_combine(
2093 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2094 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2096 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2097 if (RFlags->hasPredicate())
2098 return hash_combine(Result, RFlags->getPredicate());
2099 return Result;
2100 }
2101
2102 /// Check equality of underlying data of \p L and \p R.
2103 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2104 if (isSentinel(L) || isSentinel(R))
2105 return L == R;
2106 if (L->getVPDefID() != R->getVPDefID() ||
2107 getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
2108 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2110 !equal(L->operands(), R->operands()))
2111 return false;
2113 "must have valid opcode info for both recipes");
2114 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2115 if (LFlags->hasPredicate() &&
2116 LFlags->getPredicate() !=
2117 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2118 return false;
2119 // Recipes in replicate regions implicitly depend on predicate. If either
2120 // recipe is in a replicate region, only consider them equal if both have
2121 // the same parent.
2122 const VPRegionBlock *RegionL = L->getRegion();
2123 const VPRegionBlock *RegionR = R->getRegion();
2124 if (((RegionL && RegionL->isReplicator()) ||
2125 (RegionR && RegionR->isReplicator())) &&
2126 L->getParent() != R->getParent())
2127 return false;
2128 const VPlan *Plan = L->getParent()->getPlan();
2129 VPTypeAnalysis TypeInfo(*Plan);
2130 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2131 }
2132};
2133} // end anonymous namespace
2134
2135/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2136/// Plan.
2137static void cse(VPlan &Plan) {
2138 VPDominatorTree VPDT(Plan);
2139 DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
2140
2141 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2142 vp_depth_first_deep(Plan.getEntry()))) {
2143 for (VPRecipeBase &R : *VPBB) {
2144 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2145 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2146 continue;
2147 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2148 // V must dominate Def for a valid replacement.
2149 if (!VPDT.dominates(V->getParent(), VPBB))
2150 continue;
2151 // Only keep flags present on both V and Def.
2152 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2153 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2154 Def->replaceAllUsesWith(V);
2155 continue;
2156 }
2157 CSEMap[Def] = Def;
2158 }
2159 }
2160}
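// Illustrative aside: the CSE above keys a map on a recipe's opcode, operands
// and inferred type, replacing a later duplicate with the earlier, dominating
// definition. A minimal standalone sketch over a toy three-address form
// (hypothetical types; the dominance check present in the real code is omitted
// because the toy input is straight-line):
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ToyInstSketch {
  std::string Opcode;
  std::vector<int> Operands; // value numbers of the operands
};

// For each instruction, return the value number it should be replaced with
// (its own index if it is the first occurrence of that expression).
static std::vector<int> cseSketch(const std::vector<ToyInstSketch> &Insts) {
  std::map<std::pair<std::string, std::vector<int>>, int> Seen;
  std::vector<int> Repl(Insts.size());
  for (int I = 0, E = static_cast<int>(Insts.size()); I < E; ++I) {
    auto Key = std::make_pair(Insts[I].Opcode, Insts[I].Operands);
    auto It = Seen.try_emplace(Key, I).first; // keep the first occurrence
    Repl[I] = It->second;
  }
  return Repl;
}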
2161
2162/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2163static void licm(VPlan &Plan) {
2164 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2165
2166 // Hoist any loop invariant recipes from the vector loop region to the
2167 // preheader. Perform a shallow traversal of the vector loop region, to
2168 // exclude recipes in replicate regions. Since the top-level blocks in the
2169 // vector loop region are guaranteed to execute if the vector pre-header is,
2170 // we don't need to check speculation safety.
2171 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2172 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2173 "Expected vector prehader's successor to be the vector loop region");
2174 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2175 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2176 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2178 continue;
2179 if (any_of(R.operands(), [](VPValue *Op) {
2180 return !Op->isDefinedOutsideLoopRegions();
2181 }))
2182 continue;
2183 R.moveBefore(*Preheader, Preheader->end());
2184 }
2185 }
2186}
2187
2188void VPlanTransforms::truncateToMinimalBitwidths(
2189 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2190 if (Plan.hasScalarVFOnly())
2191 return;
2192 // Keep track of created truncates, so they can be re-used. Note that we
2193 // cannot use RAUW after creating a new truncate, as this could make
2194 // other uses have different types for their operands, making them invalidly
2195 // typed.
2196 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
2197 VPTypeAnalysis TypeInfo(Plan);
2198 VPBasicBlock *PH = Plan.getVectorPreheader();
2199 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2200 vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
2201 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2204 &R))
2205 continue;
2206
2207 VPValue *ResultVPV = R.getVPSingleValue();
2208 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2209 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2210 if (!NewResSizeInBits)
2211 continue;
2212
2213 // If the value wasn't vectorized, we must maintain the original scalar
2214 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2215 // skip casts which do not need to be handled explicitly here, as
2216 // redundant casts will be removed during recipe simplification.
2218 continue;
2219
2220 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2221 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2222 assert(OldResTy->isIntegerTy() && "only integer types supported");
2223 (void)OldResSizeInBits;
2224
2225 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2226
2227 // Any wrapping introduced by shrinking this operation shouldn't be
2228 // considered undefined behavior. So, we can't unconditionally copy
2229 // arithmetic wrapping flags to VPW.
2230 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2231 VPW->dropPoisonGeneratingFlags();
2232
2233 if (OldResSizeInBits != NewResSizeInBits &&
2234 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2235 // Extend result to original width.
2236 auto *Ext =
2237 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2238 Ext->insertAfter(&R);
2239 ResultVPV->replaceAllUsesWith(Ext);
2240 Ext->setOperand(0, ResultVPV);
2241 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2242 } else {
2243 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2244 "Only ICmps should not need extending the result.");
2245 }
2246
2247 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2249 continue;
2250
2251 // Shrink operands by introducing truncates as needed.
2252 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2253 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2254 auto *Op = R.getOperand(Idx);
2255 unsigned OpSizeInBits =
2256 TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
2257 if (OpSizeInBits == NewResSizeInBits)
2258 continue;
2259 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2260 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2261 if (!IterIsEmpty) {
2262 R.setOperand(Idx, ProcessedIter->second);
2263 continue;
2264 }
2265
2266 VPBuilder Builder;
2267 if (Op->isLiveIn())
2268 Builder.setInsertPoint(PH);
2269 else
2270 Builder.setInsertPoint(&R);
2271 VPWidenCastRecipe *NewOp =
2272 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2273 ProcessedIter->second = NewOp;
2274 R.setOperand(Idx, NewOp);
2275 }
2276
2277 }
2278 }
2279}
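// Illustrative aside: the narrowing above relies on wrapping integer
// arithmetic being width-agnostic in the low bits, so computing in the narrow
// type and zero-extending afterwards matches the low bits of the wide
// computation (MinBWs guarantees only those bits are needed). A minimal sketch
// for an 8-bit minimum bit width (hypothetical helper):
#include <cstdint>

static uint32_t narrowedMulAddSketch(uint32_t A, uint32_t B) {
  // Truncate the operands, compute in the narrow type, then extend the result,
  // mirroring the Trunc recipes inserted on operands and the final ZExt.
  uint8_t NarrowA = static_cast<uint8_t>(A);
  uint8_t NarrowB = static_cast<uint8_t>(B);
  uint8_t Narrow = static_cast<uint8_t>(NarrowA * NarrowB + NarrowA);
  return Narrow; // equals (A * B + A) & 0xff, i.e. the zero-extended result
}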
2280
2281void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
2282 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2283 vp_depth_first_shallow(Plan.getEntry()))) {
2284 VPValue *Cond;
2285 // Skip blocks that are not terminated by BranchOnCond.
2286 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2287 continue;
2288
2289 assert(VPBB->getNumSuccessors() == 2 &&
2290 "Two successors expected for BranchOnCond");
2291 unsigned RemovedIdx;
2292 if (match(Cond, m_True()))
2293 RemovedIdx = 1;
2294 else if (match(Cond, m_False()))
2295 RemovedIdx = 0;
2296 else
2297 continue;
2298
2299 VPBasicBlock *RemovedSucc =
2300 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2301 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2302 "There must be a single edge between VPBB and its successor");
2303 // Values coming from VPBB into phi recipes of RemoveSucc are removed from
2304 // these recipes.
2305 for (VPRecipeBase &R : RemovedSucc->phis())
2306 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2307
2308 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2309 // automatically on VPlan destruction if it becomes unreachable.
2310 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2311 VPBB->back().eraseFromParent();
2312 }
2313}
2314
2333
2334// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2335// the loop terminator with a branch-on-cond recipe with the negated
2336// active-lane-mask as operand. Note that this turns the loop into an
2337// uncountable one. Only the existing terminator is replaced, all other existing
2338// recipes/users remain unchanged, except for poison-generating flags being
2339// dropped from the canonical IV increment. Return the created
2340// VPActiveLaneMaskPHIRecipe.
2341//
2342// The function uses the following definitions:
2343//
2344// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2345// calculate-trip-count-minus-VF (original TC) : original TC
2346// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2347// CanonicalIVPhi : CanonicalIVIncrement
2348// %StartV is the canonical induction start value.
2349//
2350// The function adds the following recipes:
2351//
2352// vector.ph:
2353// %TripCount = calculate-trip-count-minus-VF (original TC)
2354// [if DataWithControlFlowWithoutRuntimeCheck]
2355// %EntryInc = canonical-iv-increment-for-part %StartV
2356// %EntryALM = active-lane-mask %EntryInc, %TripCount
2357//
2358// vector.body:
2359// ...
2360// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2361// ...
2362// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2363// %ALM = active-lane-mask %InLoopInc, TripCount
2364// %Negated = Not %ALM
2365// branch-on-cond %Negated
2366//
2367static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2368 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
2369 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2370 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2371 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2372 VPValue *StartV = CanonicalIVPHI->getStartValue();
2373
2374 auto *CanonicalIVIncrement =
2375 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2376 // TODO: Check if dropping the flags is needed if
2377 // !DataAndControlFlowWithoutRuntimeCheck.
2378 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2379 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2380 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2381 // we have to take unrolling into account. Each part needs to start at
2382 // Part * VF
2383 auto *VecPreheader = Plan.getVectorPreheader();
2384 VPBuilder Builder(VecPreheader);
2385
2386 // Create the ActiveLaneMask instruction using the correct start values.
2387 VPValue *TC = Plan.getTripCount();
2388
2389 VPValue *TripCount, *IncrementValue;
2390 if (!DataAndControlFlowWithoutRuntimeCheck) {
2391 // When the loop is guarded by a runtime overflow check for the loop
2392 // induction variable increment by VF, we can increment the value before
2393 // the get.active.lane mask and use the unmodified tripcount.
2394 IncrementValue = CanonicalIVIncrement;
2395 TripCount = TC;
2396 } else {
2397 // When avoiding a runtime check, the active.lane.mask inside the loop
2398 // uses a modified trip count and the induction variable increment is
2399 // done after the active.lane.mask intrinsic is called.
2400 IncrementValue = CanonicalIVPHI;
2401 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2402 {TC}, DL);
2403 }
2404 auto *EntryIncrement = Builder.createOverflowingOp(
2405 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2406 "index.part.next");
2407
2408 // Create the active lane mask instruction in the VPlan preheader.
2409 VPValue *ALMMultiplier =
2410 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2411 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2412 {EntryIncrement, TC, ALMMultiplier}, DL,
2413 "active.lane.mask.entry");
2414
2415 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2416 // preheader ActiveLaneMask instruction.
2417 auto *LaneMaskPhi =
2418 new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc::getUnknown());
2419 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2420
2421 // Create the active lane mask for the next iteration of the loop before the
2422 // original terminator.
2423 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2424 Builder.setInsertPoint(OriginalTerminator);
2425 auto *InLoopIncrement =
2426 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2427 {IncrementValue}, {false, false}, DL);
2428 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2429 {InLoopIncrement, TripCount, ALMMultiplier},
2430 DL, "active.lane.mask.next");
2431 LaneMaskPhi->addOperand(ALM);
2432
2433 // Replace the original terminator with BranchOnCond. We have to invert the
2434 // mask here because a true condition means jumping to the exit block.
2435 auto *NotMask = Builder.createNot(ALM, DL);
2436 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2437 OriginalTerminator->eraseFromParent();
2438 return LaneMaskPhi;
2439}
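// Illustrative aside: a scalar model of the active-lane-mask recipes created
// above. Lane I of active.lane.mask(Base, TripCount) is true exactly when
// Base + I is still below the trip count, which is what makes the mask usable
// both as the header mask and (negated) as the latch exit condition. A minimal
// sketch with a hypothetical helper name:
#include <cstdint>
#include <vector>

static std::vector<bool> activeLaneMaskSketch(uint64_t Base, uint64_t TripCount,
                                              unsigned NumLanes) {
  std::vector<bool> Mask(NumLanes);
  for (unsigned I = 0; I < NumLanes; ++I)
    Mask[I] = Base + I < TripCount;
  return Mask;
}

// E.g. Base = 8, TripCount = 10, NumLanes = 4 gives {true, true, false, false};
// once every lane is false, the negated mask makes branch-on-cond exit the loop.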
2440
2441/// Collect the header mask with the pattern:
2442/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2443/// TODO: Introduce explicit recipe for header-mask instead of searching
2444/// for the header-mask pattern manually.
2445static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2446 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2447 SmallVector<VPValue *> WideCanonicalIVs;
2448 auto *FoundWidenCanonicalIVUser = find_if(
2450 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2452 "Must have at most one VPWideCanonicalIVRecipe");
2453 if (FoundWidenCanonicalIVUser !=
2454 LoopRegion->getCanonicalIV()->users().end()) {
2455 auto *WideCanonicalIV =
2456 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2457 WideCanonicalIVs.push_back(WideCanonicalIV);
2458 }
2459
2460 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2461 // version of the canonical induction.
2462 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2463 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2464 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2465 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2466 WideCanonicalIVs.push_back(WidenOriginalIV);
2467 }
2468
2469 // Walk users of wide canonical IVs and find the single compare of the form
2470 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2471 VPSingleDefRecipe *HeaderMask = nullptr;
2472 for (auto *Wide : WideCanonicalIVs) {
2473 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2474 auto *VPI = dyn_cast<VPInstruction>(U);
2475 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2476 continue;
2477
2478 assert(VPI->getOperand(0) == Wide &&
2479 "WidenCanonicalIV must be the first operand of the compare");
2480 assert(!HeaderMask && "Multiple header masks found?");
2481 HeaderMask = VPI;
2482 }
2483 }
2484 return HeaderMask;
2485}
2486
2487void VPlanTransforms::addActiveLaneMask(
2488 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2489 bool DataAndControlFlowWithoutRuntimeCheck) {
2490 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2491 UseActiveLaneMaskForControlFlow) &&
2492 "DataAndControlFlowWithoutRuntimeCheck implies "
2493 "UseActiveLaneMaskForControlFlow");
2494
2495 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2496 auto *FoundWidenCanonicalIVUser = find_if(
2498 assert(FoundWidenCanonicalIVUser &&
2499 "Must have widened canonical IV when tail folding!");
2500 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2501 auto *WideCanonicalIV =
2502 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2503 VPSingleDefRecipe *LaneMask;
2504 if (UseActiveLaneMaskForControlFlow) {
2505 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2506 Plan, DataAndControlFlowWithoutRuntimeCheck);
2507 } else {
2508 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2509 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2510 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2511 LaneMask =
2512 B.createNaryOp(VPInstruction::ActiveLaneMask,
2513 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2514 nullptr, "active.lane.mask");
2515 }
2516
2517 // Walk users of WideCanonicalIV and replace the header mask of the form
2518 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2519 // removing the old one to ensure there is always only a single header mask.
2520 HeaderMask->replaceAllUsesWith(LaneMask);
2521 HeaderMask->eraseFromParent();
2522}
2523
2524/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2525/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2526/// recipe could be created.
2527/// \p HeaderMask Header Mask.
2528/// \p CurRecipe Recipe to be transformed.
2529/// \p TypeInfo VPlan-based type analysis.
2530/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2531/// \p EVL The explicit vector length parameter of vector-predication
2532/// intrinsics.
2533static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2534 VPRecipeBase &CurRecipe,
2535 VPTypeAnalysis &TypeInfo,
2536 VPValue &AllOneMask, VPValue &EVL) {
2537 // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
2538 // header mask.
2539 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2540 assert(OrigMask && "Unmasked recipe when folding tail");
2541 // HeaderMask will be handled using EVL.
2542 VPValue *Mask;
2543 if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask))))
2544 return Mask;
2545 return HeaderMask == OrigMask ? nullptr : OrigMask;
2546 };
2547
2548 /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
2549 auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
2550 auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
2551 if (!EndPtr)
2552 return Addr;
2553 assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
2554 "VPVectorEndPointerRecipe with non-VF VF operand?");
2555 assert(
2556 all_of(EndPtr->users(),
2557 [](VPUser *U) {
2558 return cast<VPWidenMemoryRecipe>(U)->isReverse();
2559 }) &&
2560 "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
2561 VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
2562 EVLAddr->insertBefore(&CurRecipe);
2563 EVLAddr->setOperand(1, &EVL);
2564 return EVLAddr;
2565 };
2566
2567 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2568 .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
2569 VPValue *NewMask = GetNewMask(L->getMask());
2570 VPValue *NewAddr = GetNewAddr(L->getAddr());
2571 return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
2572 })
2573 .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
2574 VPValue *NewMask = GetNewMask(S->getMask());
2575 VPValue *NewAddr = GetNewAddr(S->getAddr());
2576 return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
2577 })
2578 .Case<VPInterleaveRecipe>([&](VPInterleaveRecipe *IR) {
2579 VPValue *NewMask = GetNewMask(IR->getMask());
2580 return new VPInterleaveEVLRecipe(*IR, EVL, NewMask);
2581 })
2582 .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
2583 VPValue *NewMask = GetNewMask(Red->getCondOp());
2584 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2585 })
2586 .Case<VPInstruction>([&](VPInstruction *VPI) -> VPRecipeBase * {
2587 VPValue *LHS, *RHS;
2588 // Transform select with a header mask condition
2589 // select(header_mask, LHS, RHS)
2590 // into vector predication merge.
2591 // vp.merge(all-true, LHS, RHS, EVL)
2592 if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS),
2593 m_VPValue(RHS))))
2594 return nullptr;
2595 // Use all true as the condition because this transformation is
2596 // limited to selects whose condition is a header mask.
2597 return new VPWidenIntrinsicRecipe(
2598 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2599 TypeInfo.inferScalarType(LHS), VPI->getDebugLoc());
2600 })
2601 .Default([&](VPRecipeBase *R) { return nullptr; });
2602}
2603
2604/// Replace recipes with their EVL variants.
2605static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2606 VPTypeAnalysis TypeInfo(Plan);
2607 VPValue *AllOneMask = Plan.getTrue();
2608 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2609 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2610
2611 assert(all_of(Plan.getVF().users(),
2614 "User of VF that we can't transform to EVL.");
2615 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2617 });
2618
2619 assert(all_of(Plan.getVFxUF().users(),
2620 [&LoopRegion, &Plan](VPUser *U) {
2621 return match(U,
2622 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2623 m_Specific(&Plan.getVFxUF()))) ||
2624 isa<VPWidenPointerInductionRecipe>(U);
2625 }) &&
2626 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2627 "increment of the canonical induction.");
2628 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2629 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2630 // canonical induction must not be updated.
2631 return isa<VPWidenPointerInductionRecipe>(U);
2632 });
2633
2634 // Defer erasing recipes till the end so that we don't invalidate the
2635 // VPTypeAnalysis cache.
2636 SmallVector<VPRecipeBase *> ToErase;
2637
2638 // Create a scalar phi to track the previous EVL if fixed-order recurrence is
2639 // contained.
2640 bool ContainsFORs =
2641 any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2642 if (ContainsFORs) {
2643 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2644 VPValue *MaxEVL = &Plan.getVF();
2645 // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer.
2646 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2647 MaxEVL = Builder.createScalarZExtOrTrunc(
2648 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2649 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2650
2651 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2652 VPValue *PrevEVL = Builder.createScalarPhi(
2653 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2654
2655 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2656 vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
2657 for (VPRecipeBase &R : *VPBB) {
2658 VPValue *V1, *V2;
2659 if (!match(&R,
2660 m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
2661 m_VPValue(V1), m_VPValue(V2))))
2662 continue;
2663 VPValue *Imm = Plan.getOrAddLiveIn(
2664 ConstantInt::getSigned(Type::getInt32Ty(Plan.getContext()), -1));
2665 VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
2666 Intrinsic::experimental_vp_splice,
2667 {V1, V2, Imm, AllOneMask, PrevEVL, &EVL},
2668 TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
2669 VPSplice->insertBefore(&R);
2670 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2671 ToErase.push_back(&R);
2672 }
2673 }
2674 }
2675
2676 VPValue *HeaderMask = findHeaderMask(Plan);
2677 if (!HeaderMask)
2678 return;
2679
2680 // Replace header masks with a mask equivalent to predicating by EVL:
2681 //
2682 // icmp ule widen-canonical-iv backedge-taken-count
2683 // ->
2684 // icmp ult step-vector, EVL
2685 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2686 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2687 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2688 VPValue *EVLMask = Builder.createICmp(
2689 CmpInst::ICMP_ULT,
2690 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2691 HeaderMask->replaceAllUsesWith(EVLMask);
2692 ToErase.push_back(HeaderMask->getDefiningRecipe());
2693
2694 // Try to optimize header mask recipes away to their EVL variants.
2695 // TODO: Split optimizeMaskToEVL out and move into
2696 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2697 // tryToBuildVPlanWithVPRecipes beforehand.
2698 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2699 auto *CurRecipe = cast<VPRecipeBase>(U);
2700 VPRecipeBase *EVLRecipe =
2701 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
2702 if (!EVLRecipe)
2703 continue;
2704
2705 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2706 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2707 "New recipe must define the same number of values as the "
2708 "original.");
2709 EVLRecipe->insertBefore(CurRecipe);
2711 EVLRecipe)) {
2712 for (unsigned I = 0; I < NumDefVal; ++I) {
2713 VPValue *CurVPV = CurRecipe->getVPValue(I);
2714 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2715 }
2716 }
2717 ToErase.push_back(CurRecipe);
2718 }
2719 // Remove dead EVL mask.
2720 if (EVLMask->getNumUsers() == 0)
2721 ToErase.push_back(EVLMask->getDefiningRecipe());
2722
2723 for (VPRecipeBase *R : reverse(ToErase)) {
2724 SmallVector<VPValue *> PossiblyDead(R->operands());
2725 R->eraseFromParent();
2726 for (VPValue *Op : PossiblyDead)
2727 recursivelyDeleteDeadRecipes(Op);
2728 }
2729}
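// Illustrative aside: a scalar model of the header-mask rewrite above. With an
// explicit vector length, lane I is active exactly when I < EVL, which is the
// "icmp ult step-vector, EVL" form the header mask is replaced with. A minimal
// sketch with a hypothetical helper name:
#include <cstdint>
#include <vector>

static std::vector<bool> evlHeaderMaskSketch(uint32_t EVL, unsigned NumLanes) {
  std::vector<bool> Mask(NumLanes);
  for (unsigned I = 0; I < NumLanes; ++I)
    Mask[I] = I < EVL; // step-vector lane I compared against EVL
  return Mask;
}

// E.g. EVL = 3 with 4 lanes gives {true, true, true, false}, matching the lanes
// a vp.* intrinsic with that explicit vector length would touch.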
2730
2731/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2732/// replaces all uses except the canonical IV increment of
2733/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2734/// is used only for loop iterations counting after this transformation.
2735///
2736/// The function uses the following definitions:
2737/// %StartV is the canonical induction start value.
2738///
2739/// The function adds the following recipes:
2740///
2741/// vector.ph:
2742/// ...
2743///
2744/// vector.body:
2745/// ...
2746/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2747/// [ %NextEVLIV, %vector.body ]
2748/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2749/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2750/// ...
2751/// %OpEVL = cast i32 %VPEVL to IVSize
2752/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2753/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2754/// ...
2755///
2756/// If MaxSafeElements is provided, the function adds the following recipes:
2757/// vector.ph:
2758/// ...
2759///
2760/// vector.body:
2761/// ...
2762/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2763/// [ %NextEVLIV, %vector.body ]
2764/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2765/// %cmp = cmp ult %AVL, MaxSafeElements
2766/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2767/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2768/// ...
2769/// %OpEVL = cast i32 %VPEVL to IVSize
2770/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2771/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2772/// ...
2773///
2774void VPlanTransforms::addExplicitVectorLength(
2775 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2776 if (Plan.hasScalarVFOnly())
2777 return;
2778 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2779 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2780
2781 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2782 auto *CanIVTy = LoopRegion->getCanonicalIVType();
2783 VPValue *StartV = CanonicalIVPHI->getStartValue();
2784
2785 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2786 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2787 EVLPhi->insertAfter(CanonicalIVPHI);
2788 VPBuilder Builder(Header, Header->getFirstNonPhi());
2789 // Create the AVL (application vector length), starting from TC -> 0 in steps
2790 // of EVL.
2791 VPPhi *AVLPhi = Builder.createScalarPhi(
2792 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2793 VPValue *AVL = AVLPhi;
2794
2795 if (MaxSafeElements) {
2796 // Support for MaxSafeDist for correct loop emission.
2797 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
2798 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2799 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2800 "safe_avl");
2801 }
2802 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2803 DebugLoc::getUnknown());
2804
2805 auto *CanonicalIVIncrement =
2806 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2807 Builder.setInsertPoint(CanonicalIVIncrement);
2808 VPValue *OpVPEVL = VPEVL;
2809
2810 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2811 OpVPEVL = Builder.createScalarZExtOrTrunc(
2812 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2813
2814 auto *NextEVLIV = Builder.createOverflowingOp(
2815 Instruction::Add, {OpVPEVL, EVLPhi},
2816 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2817 CanonicalIVIncrement->hasNoSignedWrap()},
2818 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2819 EVLPhi->addOperand(NextEVLIV);
2820
2821 VPValue *NextAVL = Builder.createOverflowingOp(
2822 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2823 DebugLoc::getCompilerGenerated(), "avl.next");
2824 AVLPhi->addOperand(NextAVL);
2825
2826 transformRecipestoEVLRecipes(Plan, *VPEVL);
2827
2828 // Replace all uses of VPCanonicalIVPHIRecipe by
2829 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2830 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2831 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2832 // TODO: support unroll factor > 1.
2833 Plan.setUF(1);
2834}
2835
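// Illustrative aside: a scalar sketch of the AVL/EVL iteration scheme the
// recipes above set up. AVL counts the remaining elements down from the trip
// count, each iteration processes EVL = min(AVL, Limit) elements (Limit
// standing in for whatever ExplicitVectorLength and MaxSafeElements allow),
// and the loop exits once AVL reaches zero. Names are hypothetical.
#include <algorithm>
#include <cstdint>

static uint64_t countEVLIterationsSketch(uint64_t TripCount, uint64_t Limit) {
  uint64_t AVL = TripCount; // %AVL phi, starts at the trip count
  uint64_t EVLBasedIV = 0;  // %EVLPhi, the EVL-based induction
  uint64_t Iterations = 0;
  while (AVL != 0) {
    uint64_t EVL = std::min(AVL, Limit); // EXPLICIT-VECTOR-LENGTH %AVL
    // ... process elements [EVLBasedIV, EVLBasedIV + EVL) ...
    EVLBasedIV += EVL; // %NextEVLIV = add %OpEVL, %EVLPhi
    AVL -= EVL;        // %NextAVL  = sub nuw %AVL, %OpEVL
    ++Iterations;
  }
  return Iterations;
}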
2836void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
2837 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2838 // There should be only one EVL PHI in the entire plan.
2839 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2840
2843 for (VPRecipeBase &R : VPBB->phis())
2844 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2845 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2846 EVLPhi = PhiR;
2847 }
2848
2849 // Early return if no EVL PHI is found.
2850 if (!EVLPhi)
2851 return;
2852
2853 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2854 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2855 VPValue *AVL;
2856 [[maybe_unused]] bool FoundAVL =
2857 match(EVLIncrement,
2858 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2859 assert(FoundAVL && "Didn't find AVL?");
2860
2861 // The AVL may be capped to a safe distance.
2862 VPValue *SafeAVL;
2863 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2864 AVL = SafeAVL;
2865
2866 VPValue *AVLNext;
2867 [[maybe_unused]] bool FoundAVLNext =
2869 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
2870 assert(FoundAVLNext && "Didn't find AVL backedge?");
2871
2872 // Convert EVLPhi to concrete recipe.
2873 auto *ScalarR =
2874 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2875 EVLPhi->getDebugLoc(), "evl.based.iv");
2876 EVLPhi->replaceAllUsesWith(ScalarR);
2877 EVLPhi->eraseFromParent();
2878
2879 // Replace CanonicalIVInc with EVL-PHI increment.
2880 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2881 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2882 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
2883 m_Specific(&Plan.getVFxUF()))) &&
2884 "Unexpected canonical iv");
2885 Backedge->replaceAllUsesWith(EVLIncrement);
2886
2887 // Remove unused phi and increment.
2888 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2889 CanonicalIVIncrement->eraseFromParent();
2890 CanonicalIV->eraseFromParent();
2891
2892 // Replace the use of VectorTripCount in the latch-exiting block.
2893 // Before: (branch-on-count EVLIVInc, VectorTripCount)
2894 // After: (branch-on-cond eq AVLNext, 0)
2895
2896 VPBasicBlock *LatchExiting =
2897 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2898 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2899 // Skip single-iteration loop region
2900 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2901 return;
2902 assert(LatchExitingBr &&
2903 match(LatchExitingBr,
2904 m_BranchOnCount(m_VPValue(EVLIncrement),
2905 m_Specific(&Plan.getVectorTripCount()))) &&
2906 "Unexpected terminator in EVL loop");
2907
2908 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
2909 VPBuilder Builder(LatchExitingBr);
2910 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
2911 Plan.getConstantInt(AVLTy, 0));
2912 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
2913 LatchExitingBr->eraseFromParent();
2914}
2915
2916void VPlanTransforms::replaceSymbolicStrides(
2917 VPlan &Plan, PredicatedScalarEvolution &PSE,
2918 const DenseMap<Value *, const SCEV *> &StridesMap) {
2919 // Replace VPValues for known constant strides guaranteed by predicate scalar
2920 // evolution.
2921 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
2922 auto *R = cast<VPRecipeBase>(&U);
2923 return R->getRegion() ||
2924 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
2925 };
2926 ValueToSCEVMapTy RewriteMap;
2927 for (const SCEV *Stride : StridesMap.values()) {
2928 using namespace SCEVPatternMatch;
2929 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
2930 const APInt *StrideConst;
2931 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
2932 // Only handle constant strides for now.
2933 continue;
2934
2935 auto *CI = Plan.getConstantInt(*StrideConst);
2936 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
2937 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2938
2939 // The versioned value may not be used in the loop directly but through a
2940 // sext/zext. Add new live-ins in those cases.
2941 for (Value *U : StrideV->users()) {
2942 if (!isa<SExtInst, ZExtInst>(U))
2943 continue;
2944 VPValue *StrideVPV = Plan.getLiveIn(U);
2945 if (!StrideVPV)
2946 continue;
2947 unsigned BW = U->getType()->getScalarSizeInBits();
2948 APInt C =
2949 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
2950 VPValue *CI = Plan.getConstantInt(C);
2951 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
2952 }
2953 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
2954 }
2955
2956 for (VPRecipeBase &R : *Plan.getEntry()) {
2957 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
2958 if (!ExpSCEV)
2959 continue;
2960 const SCEV *ScevExpr = ExpSCEV->getSCEV();
2961 auto *NewSCEV =
2962 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
2963 if (NewSCEV != ScevExpr) {
2964 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
2965 ExpSCEV->replaceAllUsesWith(NewExp);
2966 if (Plan.getTripCount() == ExpSCEV)
2967 Plan.resetTripCount(NewExp);
2968 }
2969 }
2970}
2971
2972void VPlanTransforms::dropPoisonGeneratingRecipes(
2973 VPlan &Plan,
2974 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2975 // Collect recipes in the backward slice of `Root` that may generate a poison
2976 // value that is used after vectorization.
2977 SmallPtrSet<VPRecipeBase *, 16> Visited;
2978 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2979 SmallVector<VPRecipeBase *, 16> Worklist;
2980 Worklist.push_back(Root);
2981
2982 // Traverse the backward slice of Root through its use-def chain.
2983 while (!Worklist.empty()) {
2984 VPRecipeBase *CurRec = Worklist.pop_back_val();
2985
2986 if (!Visited.insert(CurRec).second)
2987 continue;
2988
2989 // Prune search if we find another recipe generating a widen memory
2990 // instruction. Widen memory instructions involved in address computation
2991 // will lead to gather/scatter instructions, which don't need to be
2992 // handled.
2994 VPHeaderPHIRecipe>(CurRec))
2995 continue;
2996
2997 // This recipe contributes to the address computation of a widen
2998 // load/store. If the underlying instruction has poison-generating flags,
2999 // drop them directly.
3000 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3001 VPValue *A, *B;
3002 // Dropping disjoint from an OR may yield incorrect results, as some
3003 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3004 // for dependence analysis). Instead, replace it with an equivalent Add.
3005 // This is possible as all users of the disjoint OR only access lanes
3006 // where the operands are disjoint or poison otherwise.
3007 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3008 RecWithFlags->isDisjoint()) {
3009 VPBuilder Builder(RecWithFlags);
3010 VPInstruction *New = Builder.createOverflowingOp(
3011 Instruction::Add, {A, B}, {false, false},
3012 RecWithFlags->getDebugLoc());
3013 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3014 RecWithFlags->replaceAllUsesWith(New);
3015 RecWithFlags->eraseFromParent();
3016 CurRec = New;
3017 } else
3018 RecWithFlags->dropPoisonGeneratingFlags();
3019 } else {
3020 Instruction *Instr = dyn_cast_or_null<Instruction>(
3021 CurRec->getVPSingleValue()->getUnderlyingValue());
3022 (void)Instr;
3023 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3024 "found instruction with poison generating flags not covered by "
3025 "VPRecipeWithIRFlags");
3026 }
3027
3028 // Add new definitions to the worklist.
3029 for (VPValue *Operand : CurRec->operands())
3030 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3031 Worklist.push_back(OpDef);
3032 }
3033 });
3034
3035 // Traverse all the recipes in the VPlan and collect the poison-generating
3036 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3037 // VPInterleaveRecipe.
3038 auto Iter = vp_depth_first_deep(Plan.getEntry());
3040 for (VPRecipeBase &Recipe : *VPBB) {
3041 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3042 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3043 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3044 if (AddrDef && WidenRec->isConsecutive() &&
3045 BlockNeedsPredication(UnderlyingInstr.getParent()))
3046 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3047 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3048 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3049 if (AddrDef) {
3050 // Check if any member of the interleave group needs predication.
3051 const InterleaveGroup<Instruction> *InterGroup =
3052 InterleaveRec->getInterleaveGroup();
3053 bool NeedPredication = false;
3054 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3055 I < NumMembers; ++I) {
3056 Instruction *Member = InterGroup->getMember(I);
3057 if (Member)
3058 NeedPredication |= BlockNeedsPredication(Member->getParent());
3059 }
3060
3061 if (NeedPredication)
3062 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3063 }
3064 }
3065 }
3066 }
3067}
3068
3069void VPlanTransforms::createInterleaveGroups(
3070 VPlan &Plan,
3071 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
3072 &InterleaveGroups,
3073 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3074 if (InterleaveGroups.empty())
3075 return;
3076
3077 // Interleave memory: for each Interleave Group we marked earlier as relevant
3078 // for this VPlan, replace the Recipes widening its memory instructions with a
3079 // single VPInterleaveRecipe at its insertion point.
3080 VPDominatorTree VPDT(Plan);
3081 for (const auto *IG : InterleaveGroups) {
3082 auto *Start =
3083 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3084 VPIRMetadata InterleaveMD(*Start);
3085 SmallVector<VPValue *, 4> StoredValues;
3086 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3087 StoredValues.push_back(StoreR->getStoredValue());
3088 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3089 Instruction *MemberI = IG->getMember(I);
3090 if (!MemberI)
3091 continue;
3092 VPWidenMemoryRecipe *MemoryR =
3093 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3094 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3095 StoredValues.push_back(StoreR->getStoredValue());
3096 InterleaveMD.intersect(*MemoryR);
3097 }
3098
3099 bool NeedsMaskForGaps =
3100 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3101 (!StoredValues.empty() && !IG->isFull());
3102
3103 Instruction *IRInsertPos = IG->getInsertPos();
3104 auto *InsertPos =
3105 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3106
3108 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3109 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3110 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3111
3112 // Get or create the start address for the interleave group.
3113 VPValue *Addr = Start->getAddr();
3114 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3115 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3116 // We cannot re-use the address of member zero because it does not
3117 // dominate the insert position. Instead, use the address of the insert
3118 // position and create a PtrAdd adjusting it to the address of member
3119 // zero.
3120 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3121 // InsertPos or sink loads above zero members to join it.
3122 assert(IG->getIndex(IRInsertPos) != 0 &&
3123 "index of insert position shouldn't be zero");
3124 auto &DL = IRInsertPos->getDataLayout();
3125 APInt Offset(32,
3126 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3127 IG->getIndex(IRInsertPos),
3128 /*IsSigned=*/true);
3129 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3130 VPBuilder B(InsertPos);
3131 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3132 }
3133 // If the group is reverse, adjust the index to refer to the last vector
3134 // lane instead of the first. We adjust the index from the first vector
3135 // lane, rather than directly getting the pointer for lane VF - 1, because
3136 // the pointer operand of the interleaved access is supposed to be uniform.
3137 if (IG->isReverse()) {
3138 auto *ReversePtr = new VPVectorEndPointerRecipe(
3139 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3140 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3141 ReversePtr->insertBefore(InsertPos);
3142 Addr = ReversePtr;
3143 }
3144 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3145 InsertPos->getMask(), NeedsMaskForGaps,
3146 InterleaveMD, InsertPos->getDebugLoc());
3147 VPIG->insertBefore(InsertPos);
3148
3149 unsigned J = 0;
3150 for (unsigned i = 0; i < IG->getFactor(); ++i)
3151 if (Instruction *Member = IG->getMember(i)) {
3152 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3153 if (!Member->getType()->isVoidTy()) {
3154 VPValue *OriginalV = MemberR->getVPSingleValue();
3155 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3156 J++;
3157 }
3158 MemberR->eraseFromParent();
3159 }
3160 }
3161}
3162
3163/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3164/// value, phi and backedge value. In the following example:
3165///
3166/// vector.ph:
3167/// Successor(s): vector loop
3168///
3169/// <x1> vector loop: {
3170/// vector.body:
3171/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3172/// ...
3173/// EMIT branch-on-count ...
3174/// No successors
3175/// }
3176///
3177/// WIDEN-INDUCTION will get expanded to:
3178///
3179/// vector.ph:
3180/// ...
3181/// vp<%induction.start> = ...
3182/// vp<%induction.increment> = ...
3183///
3184/// Successor(s): vector loop
3185///
3186/// <x1> vector loop: {
3187/// vector.body:
3188/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3189/// ...
3190/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3191/// EMIT branch-on-count ...
3192/// No successors
3193/// }
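///
/// In effect, the expanded recipes compute
///   induction.start = broadcast(%start) + step-vector * broadcast(%step)
/// in the preheader and
///   vec.ind.next = vec.ind + broadcast(%step * VF)
/// on the backedge, using FMul/FAdd and the induction's fast-math flags for
/// floating-point inductions.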
3194static void
3195expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
3196                              VPTypeAnalysis &TypeInfo) {
3197 VPlan *Plan = WidenIVR->getParent()->getPlan();
3198 VPValue *Start = WidenIVR->getStartValue();
3199 VPValue *Step = WidenIVR->getStepValue();
3200 VPValue *VF = WidenIVR->getVFValue();
3201 DebugLoc DL = WidenIVR->getDebugLoc();
3202
3203 // The value from the original loop to which we are mapping the new induction
3204 // variable.
3205 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3206
3207  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3208  Instruction::BinaryOps AddOp;
3209  Instruction::BinaryOps MulOp;
3210  // FIXME: The newly created binary instructions should contain nsw/nuw
3211 // flags, which can be found from the original scalar operations.
3212 VPIRFlags Flags;
3213 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3214 AddOp = Instruction::Add;
3215 MulOp = Instruction::Mul;
3216 } else {
3217 AddOp = ID.getInductionOpcode();
3218 MulOp = Instruction::FMul;
3219 Flags = ID.getInductionBinOp()->getFastMathFlags();
3220 }
3221
3222 // If the phi is truncated, truncate the start and step values.
3223 VPBuilder Builder(Plan->getVectorPreheader());
3224 Type *StepTy = TypeInfo.inferScalarType(Step);
3225 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3226 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3227 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3228 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3229 StepTy = Ty;
3230 }
3231
3232 // Construct the initial value of the vector IV in the vector loop preheader.
3233 Type *IVIntTy =
3235 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3236 if (StepTy->isFloatingPointTy())
3237 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3238
3239 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3240 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3241
3242 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3243 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3244 DebugLoc::getUnknown(), "induction");
3245
3246 // Create the widened phi of the vector IV.
3247 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3248 WidenIVR->getDebugLoc(), "vec.ind");
3249 WidePHI->insertBefore(WidenIVR);
3250
3251 // Create the backedge value for the vector IV.
3252 VPValue *Inc;
3253 VPValue *Prev;
3254 // If unrolled, use the increment and prev value from the operands.
3255 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3256 Inc = SplatVF;
3257 Prev = WidenIVR->getLastUnrolledPartOperand();
3258 } else {
3259 if (VPRecipeBase *R = VF->getDefiningRecipe())
3260 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3261 // Multiply the vectorization factor by the step using integer or
3262 // floating-point arithmetic as appropriate.
3263 if (StepTy->isFloatingPointTy())
3264 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3265 DL);
3266 else
3267 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3268 TypeInfo.inferScalarType(VF), DL);
3269
3270 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3271 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3272 Prev = WidePHI;
3273 }
3274
3275  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3276  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3277 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3278 WidenIVR->getDebugLoc(), "vec.ind.next");
3279
3280 WidePHI->addOperand(Next);
3281
3282 WidenIVR->replaceAllUsesWith(WidePHI);
3283}
3284
3285/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3286/// initial value, phi and backedge value. In the following example:
3287///
3288/// <x1> vector loop: {
3289/// vector.body:
3290/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3291/// ...
3292/// EMIT branch-on-count ...
3293/// }
3294///
3295/// WIDEN-POINTER-INDUCTION will get expanded to:
3296///
3297/// <x1> vector loop: {
3298/// vector.body:
3299/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3300/// EMIT %mul = mul %stepvector, %step
3301/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3302/// ...
3303/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3304/// EMIT branch-on-count ...
3305/// }
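///
/// In effect, %vector.gep yields the per-lane addresses
///   %pointer.phi + <0, 1, ..., VF-1> * %step
/// for the current iteration, while %ptr.ind advances the scalar phi by
/// VF * %step for the next iteration.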
3306static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
3307                                          VPTypeAnalysis &TypeInfo) {
3308 VPlan *Plan = R->getParent()->getPlan();
3309 VPValue *Start = R->getStartValue();
3310 VPValue *Step = R->getStepValue();
3311 VPValue *VF = R->getVFValue();
3312
3313  assert(R->getInductionDescriptor().getKind() ==
3314             InductionDescriptor::IK_PtrInduction &&
3315         "Not a pointer induction according to InductionDescriptor!");
3316 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3317 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3318 "Recipe should have been replaced");
3319
3320 VPBuilder Builder(R);
3321 DebugLoc DL = R->getDebugLoc();
3322
3323 // Build a scalar pointer phi.
3324 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3325
3326 // Create actual address geps that use the pointer phi as base and a
3327 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3328 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3329 Type *StepTy = TypeInfo.inferScalarType(Step);
3330 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3331 Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
3332 VPValue *PtrAdd = Builder.createNaryOp(
3333 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3334 R->replaceAllUsesWith(PtrAdd);
3335
3336  // Create the backedge value for the scalar pointer phi.
3337  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
3338  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3339 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3340 DL);
3341 VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
3342
3343 VPValue *InductionGEP =
3344 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3345 ScalarPtrPhi->addOperand(InductionGEP);
3346}
3347
3348void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
3349  // Replace loop regions with explicit CFG.
3350  SmallVector<VPRegionBlock *> LoopRegions;
3351  for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
3352           vp_depth_first_deep(Plan.getEntry()))) {
3353 if (!R->isReplicator())
3354 LoopRegions.push_back(R);
3355 }
3356 for (VPRegionBlock *R : LoopRegions)
3357 R->dissolveToCFGLoop();
3358}
3359
3360void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
3361  VPTypeAnalysis TypeInfo(Plan);
3362  SmallVector<VPRecipeBase *> ToRemove;
3363  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3364           vp_depth_first_deep(Plan.getEntry()))) {
3365 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3366 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3367 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3368 ToRemove.push_back(WidenIVR);
3369 continue;
3370 }
3371
3372 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3373 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3374 ToRemove.push_back(WidenIVR);
3375 continue;
3376 }
3377
3378 // Expand VPBlendRecipe into VPInstruction::Select.
3379 VPBuilder Builder(&R);
3380 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3381 VPValue *Select = Blend->getIncomingValue(0);
3382 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3383 Select = Builder.createSelect(Blend->getMask(I),
3384 Blend->getIncomingValue(I), Select,
3385 R.getDebugLoc(), "predphi");
3386 Blend->replaceAllUsesWith(Select);
3387 ToRemove.push_back(Blend);
3388 }
3389
3390 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3391 Expr->decompose();
3392 ToRemove.push_back(Expr);
3393 }
3394
3395 VPValue *VectorStep;
3396 VPValue *ScalarStep;
3397      if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
3398                         m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3399 continue;
3400
3401 // Expand WideIVStep.
3402 auto *VPI = cast<VPInstruction>(&R);
3403 Type *IVTy = TypeInfo.inferScalarType(VPI);
3404 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3405        Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
3406                                          ? Instruction::UIToFP
3407 : Instruction::Trunc;
3408 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3409 }
3410
3411 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3412 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3413 ScalarStep =
3414 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3415 }
3416
3417 VPIRFlags Flags;
3418 if (IVTy->isFloatingPointTy())
3419 Flags = {VPI->getFastMathFlags()};
3420
3421 unsigned MulOpc =
3422 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3423 VPInstruction *Mul = Builder.createNaryOp(
3424 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3425 VectorStep = Mul;
3426 VPI->replaceAllUsesWith(VectorStep);
3427 ToRemove.push_back(VPI);
3428 }
3429 }
3430
3431 for (VPRecipeBase *R : ToRemove)
3432 R->eraseFromParent();
3433}
3434
3435void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
3436                                                 VPBasicBlock *EarlyExitVPBB,
3437 VPlan &Plan,
3438 VPBasicBlock *HeaderVPBB,
3439 VPBasicBlock *LatchVPBB) {
3440 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3441 if (!EarlyExitVPBB->getSinglePredecessor() &&
3442 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3443 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3444 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3445 "unsupported early exit VPBB");
3446 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3447 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3448 // of the phis.
3449 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3450 cast<VPIRPhi>(&R)->swapOperands();
3451 }
3452
3453 VPBuilder Builder(LatchVPBB->getTerminator());
3454 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3455 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3456         "Terminator must be BranchOnCond");
3457 VPValue *CondOfEarlyExitingVPBB =
3458 EarlyExitingVPBB->getTerminator()->getOperand(0);
3459 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3460 ? CondOfEarlyExitingVPBB
3461 : Builder.createNot(CondOfEarlyExitingVPBB);
3462
3463 // Split the middle block and have it conditionally branch to the early exit
3464 // block if CondToEarlyExit.
3465 VPValue *IsEarlyExitTaken =
3466 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3467 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3468 VPBasicBlock *VectorEarlyExitVPBB =
3469 Plan.createVPBasicBlock("vector.early.exit");
3470 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3471 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3472 NewMiddle->swapSuccessors();
3473
3474 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3475
3476 // Update the exit phis in the early exit block.
3477 VPBuilder MiddleBuilder(NewMiddle);
3478 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3479 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3480 auto *ExitIRI = cast<VPIRPhi>(&R);
3481 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3482 // a single predecessor and 1 if it has two.
3483 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3484 if (ExitIRI->getNumOperands() != 1) {
3485 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3486 // predecessor. Extract its last lane.
3487 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3488 }
3489
3490 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3491 if (!IncomingFromEarlyExit->isLiveIn()) {
3492 // Update the incoming value from the early exit.
3493 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3494 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3495 "first.active.lane");
3496 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3497 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3498 nullptr, "early.exit.value");
3499 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3500 }
3501 }
3502 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3503
3504 // Replace the condition controlling the non-early exit from the vector loop
3505 // with one exiting if either the original condition of the vector latch is
3506 // true or the early exit has been taken.
3507 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3508 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3509 "Unexpected terminator");
3510 auto *IsLatchExitTaken =
3511 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3512 LatchExitingBranch->getOperand(1));
3513 auto *AnyExitTaken = Builder.createNaryOp(
3514 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3515 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3516 LatchExitingBranch->eraseFromParent();
3517}
3518
3519/// This function tries to convert extended in-loop reductions to
3520/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3521/// valid. The created recipe must be decomposed to its constituent
3522/// recipes before execution.
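/// For example, reduce.add(zext(A)) is turned into a single extended-reduction
/// expression when its cost is lower than the summed costs of the separate
/// extend and reduction for all VFs in \p Range.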
3523static VPExpressionRecipe *
3524tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
3525                                     VFRange &Range) {
3526 Type *RedTy = Ctx.Types.inferScalarType(Red);
3527 VPValue *VecOp = Red->getVecOp();
3528
3529 // Clamp the range if using extended-reduction is profitable.
3530 auto IsExtendedRedValidAndClampRange =
3531 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3532    return LoopVectorizationPlanner::getDecisionAndClampRange(
3533        [&](ElementCount VF) {
3534 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3536
3537 InstructionCost ExtRedCost;
3538 InstructionCost ExtCost =
3539 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3540 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3541
3545 // FIXME: Move partial reduction creation, costing and clamping
3546 // here from LoopVectorize.cpp.
3547 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3548 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3549 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3550 } else {
3551 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3552 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3553 Red->getFastMathFlags(), CostKind);
3554 }
3555 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3556 },
3557 Range);
3558 };
3559
3560 VPValue *A;
3561  // Match reduce(ext).
3562 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3563 IsExtendedRedValidAndClampRange(
3564 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3565 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3566 Ctx.Types.inferScalarType(A)))
3567 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3568
3569 return nullptr;
3570}
3571
3572/// This function tries to convert extended in-loop reductions to
3573/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3574/// and valid. The created VPExpressionRecipe must be decomposed to its
3575/// constituent recipes before execution. Patterns of the
3576/// VPExpressionRecipe:
3577/// reduce.add(mul(...)),
3578/// reduce.add(mul(ext(A), ext(B))),
3579/// reduce.add(ext(mul(ext(A), ext(B)))).
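/// The expression is only created when the combined multiply-accumulate cost
/// is lower than the summed costs of the separate extends, multiply and
/// reduction for all VFs in \p Range.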
3580static VPExpressionRecipe *
3582 VPCostContext &Ctx, VFRange &Range) {
3583 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3584
3585 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3586 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3587 return nullptr;
3588
3589 Type *RedTy = Ctx.Types.inferScalarType(Red);
3590
3591 // Clamp the range if using multiply-accumulate-reduction is profitable.
3592 auto IsMulAccValidAndClampRange =
3593      [&](VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
3594          VPWidenCastRecipe *OuterExt) -> bool {
3595    return LoopVectorizationPlanner::getDecisionAndClampRange(
3596        [&](ElementCount VF) {
3598 Type *SrcTy =
3599 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3600 InstructionCost MulAccCost;
3601
3602 if (IsPartialReduction) {
3603 Type *SrcTy2 =
3604 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3605 // FIXME: Move partial reduction creation, costing and clamping
3606 // here from LoopVectorize.cpp.
3607 MulAccCost = Ctx.TTI.getPartialReductionCost(
3608 Opcode, SrcTy, SrcTy2, RedTy, VF,
3610 Ext0->getOpcode())
3613 Ext1->getOpcode())
3615 Mul->getOpcode(), CostKind);
3616 } else {
3617 // Only partial reductions support mixed extends at the moment.
3618 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3619 return false;
3620
3621 bool IsZExt =
3622 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3623 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3624 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3625 SrcVecTy, CostKind);
3626 }
3627
3628 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3629 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3630 InstructionCost ExtCost = 0;
3631 if (Ext0)
3632 ExtCost += Ext0->computeCost(VF, Ctx);
3633 if (Ext1)
3634 ExtCost += Ext1->computeCost(VF, Ctx);
3635 if (OuterExt)
3636 ExtCost += OuterExt->computeCost(VF, Ctx);
3637
3638 return MulAccCost.isValid() &&
3639 MulAccCost < ExtCost + MulCost + RedCost;
3640 },
3641 Range);
3642 };
3643
3644 VPValue *VecOp = Red->getVecOp();
3645 VPRecipeBase *Sub = nullptr;
3646 VPValue *A, *B;
3647 VPValue *Tmp = nullptr;
3648 // Sub reductions could have a sub between the add reduction and vec op.
3649 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3650 Sub = VecOp->getDefiningRecipe();
3651 VecOp = Tmp;
3652 }
3653
3654 // If ValB is a constant and can be safely extended, truncate it to the same
3655 // type as ExtA's operand, then extend it to the same type as ExtA. This
3656 // creates two uniform extends that can more easily be matched by the rest of
3657 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3658 // replaced with the new extend of the constant.
3659 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3660 VPWidenCastRecipe *&ExtB,
3661 VPValue *&ValB, VPWidenRecipe *Mul) {
3662 if (!ExtA || ExtB || !ValB->isLiveIn())
3663 return;
3664 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3665 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3666 const APInt *Const;
3667 if (!match(ValB, m_APInt(Const)) ||
3669 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3670 return;
3671 // The truncate ensures that the type of each extended operand is the
3672 // same, and it's been proven that the constant can be extended from
3673 // NarrowTy safely. Necessary since ExtA's extended operand would be
3674 // e.g. an i8, while the const will likely be an i32. This will be
3675 // elided by later optimisations.
3676 VPBuilder Builder(Mul);
3677 auto *Trunc =
3678 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3679 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3680 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3681 Mul->setOperand(1, ExtB);
3682 };
3683
3684 // Try to match reduce.add(mul(...)).
3685 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3686 auto *RecipeA =
3687 dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3688 auto *RecipeB =
3689 dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3690 auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
3691
3692 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3693 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3694
3695 // Match reduce.add/sub(mul(ext, ext)).
3696 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3697 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3698 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3699 if (Sub)
3700 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3701 cast<VPWidenRecipe>(Sub), Red);
3702 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3703 }
3704 // TODO: Add an expression type for this variant with a negated mul
3705 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3706 return new VPExpressionRecipe(Mul, Red);
3707 }
3708 // TODO: Add an expression type for negated versions of other expression
3709 // variants.
3710 if (Sub)
3711 return nullptr;
3712
3713 // Match reduce.add(ext(mul(A, B))).
3714 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3715 auto *Ext = cast<VPWidenCastRecipe>(VecOp->getDefiningRecipe());
3716 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0)->getDefiningRecipe());
3717 auto *Ext0 = dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
3718 auto *Ext1 = dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
3719
3720 // reduce.add(ext(mul(ext, const)))
3721 // -> reduce.add(ext(mul(ext, ext(const))))
3722 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3723
3724 // reduce.add(ext(mul(ext(A), ext(B))))
3725 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3726  // The inner extends must have the same opcode as each other, and either
3727  // match the outer extend's opcode or be the same recipe (in which case the
3728  // multiply can never produce a negative value). The outer extend can then
3729  // be folded away by doing wider extends for the operands of the mul.
3730 if (Ext0 && Ext1 &&
3731 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3732 Ext0->getOpcode() == Ext1->getOpcode() &&
3733 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3734 auto *NewExt0 = new VPWidenCastRecipe(
3735 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
3736 *Ext0, Ext0->getDebugLoc());
3737 NewExt0->insertBefore(Ext0);
3738
3739 VPWidenCastRecipe *NewExt1 = NewExt0;
3740 if (Ext0 != Ext1) {
3741 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3742 Ext->getResultType(), *Ext1, *Ext1,
3743 Ext1->getDebugLoc());
3744 NewExt1->insertBefore(Ext1);
3745 }
3746 Mul->setOperand(0, NewExt0);
3747 Mul->setOperand(1, NewExt1);
3748 Red->setOperand(1, Mul);
3749 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3750 }
3751 }
3752 return nullptr;
3753}
3754
3755/// This function tries to create abstract recipes from the reduction recipe
3756/// for the following optimizations and cost estimation.
3757static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
3758                                               VPCostContext &Ctx,
3759 VFRange &Range) {
3760 VPExpressionRecipe *AbstractR = nullptr;
3761 auto IP = std::next(Red->getIterator());
3762 auto *VPBB = Red->getParent();
3763 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3764 AbstractR = MulAcc;
3765 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3766 AbstractR = ExtRed;
3767 // Cannot create abstract inloop reduction recipes.
3768 if (!AbstractR)
3769 return;
3770
3771 AbstractR->insertBefore(*VPBB, IP);
3772 Red->replaceAllUsesWith(AbstractR);
3773}
3774
3785
3787 if (Plan.hasScalarVFOnly())
3788 return;
3789
3790#ifndef NDEBUG
3791 VPDominatorTree VPDT(Plan);
3792#endif
3793
3794 SmallVector<VPValue *> VPValues;
3797 append_range(VPValues, Plan.getLiveIns());
3798 for (VPRecipeBase &R : *Plan.getEntry())
3799 append_range(VPValues, R.definedValues());
3800
3801 auto *VectorPreheader = Plan.getVectorPreheader();
3802 for (VPValue *VPV : VPValues) {
3804 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3805 isa<Constant>(VPV->getLiveInIRValue())))
3806 continue;
3807
3808 // Add explicit broadcast at the insert point that dominates all users.
3809 VPBasicBlock *HoistBlock = VectorPreheader;
3810 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3811 for (VPUser *User : VPV->users()) {
3812 if (User->usesScalars(VPV))
3813 continue;
3814 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3815 HoistPoint = HoistBlock->begin();
3816 else
3817 assert(VPDT.dominates(VectorPreheader,
3818 cast<VPRecipeBase>(User)->getParent()) &&
3819 "All users must be in the vector preheader or dominated by it");
3820 }
3821
3822 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3823 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3824 VPV->replaceUsesWithIf(Broadcast,
3825 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3826 return Broadcast != &U && !U.usesScalars(VPV);
3827 });
3828 }
3829}
3830
3831void VPlanTransforms::materializeConstantVectorTripCount(
3832    VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3833    PredicatedScalarEvolution &PSE) {
3834  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3835 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3836
3837 VPValue *TC = Plan.getTripCount();
3838  // Skip cases for which the trip count may be non-trivial to materialize,
3839  // i.e., when a scalar tail is absent (due to tail folding) or when a scalar
3840  // tail is required.
3841 if (!Plan.hasScalarTail() ||
3843 Plan.getScalarPreheader() ||
3844 !TC->isLiveIn())
3845 return;
3846
3847  // Materialize vector trip counts for constants early if it can simply
3848  // be computed as (Original TC / (VF * UF)) * (VF * UF).
3849 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
3850 // tail-folded loops.
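  // For example, an original trip count of 17 with VF * UF = 8 folds to a
  // vector trip count of (17 / 8) * 8 = 16.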
3851 ScalarEvolution &SE = *PSE.getSE();
3852 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3853 if (!isa<SCEVConstant>(TCScev))
3854 return;
3855 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3856 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3857 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
3858 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
3859}
3860
3861void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
3862                                                    VPBasicBlock *VectorPH) {
3863  VPValue *BTC = Plan.getBackedgeTakenCount();
3864  if (BTC->getNumUsers() == 0)
3865 return;
3866
3867 VPBuilder Builder(VectorPH, VectorPH->begin());
3868 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
3869 auto *TCMO = Builder.createNaryOp(
3870 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
3871 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
3872 BTC->replaceAllUsesWith(TCMO);
3873}
3874
3876 if (Plan.hasScalarVFOnly())
3877 return;
3878
3879 VPTypeAnalysis TypeInfo(Plan);
3880 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3881 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3882      vp_depth_first_shallow(Plan.getEntry()));
3883  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
3884 vp_depth_first_shallow(LoopRegion->getEntry()));
3885 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
3886 // VPInstructions, excluding ones in replicate regions. Those are not
3887 // materialized explicitly yet. Those vector users are still handled in
3888 // VPReplicateRegion::execute(), via shouldPack().
3889 // TODO: materialize build vectors for replicating recipes in replicating
3890 // regions.
3891 for (VPBasicBlock *VPBB :
3892 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
3893 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3895 continue;
3896 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
3897 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
3898 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3899 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
3900 };
3901 if ((isa<VPReplicateRecipe>(DefR) &&
3902 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
3903 (isa<VPInstruction>(DefR) &&
3905 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
3906 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
3907 continue;
3908
3909 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
3910 unsigned Opcode = ScalarTy->isStructTy()
3912                    ? VPInstruction::BuildStructVector : VPInstruction::BuildVector;
3913      auto *BuildVector = new VPInstruction(Opcode, {DefR});
3914 BuildVector->insertAfter(DefR);
3915
3916 DefR->replaceUsesWithIf(
3917 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
3918 VPUser &U, unsigned) {
3919 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
3920 });
3921 }
3922 }
3923
3924 // Create explicit VPInstructions to convert vectors to scalars. The current
3925 // implementation is conservative - it may miss some cases that may or may not
3926 // be vector values. TODO: introduce Unpacks speculatively - remove them later
3927 // if they are known to operate on scalar values.
3928 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
3929 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3932 continue;
3933 for (VPValue *Def : R.definedValues()) {
3934 // Skip recipes that are single-scalar or only have their first lane
3935 // used.
3936 // TODO: The Defs skipped here may or may not be vector values.
3937 // Introduce Unpacks, and remove them later, if they are guaranteed to
3938 // produce scalar values.
3940 continue;
3941
3942 // At the moment, we create unpacks only for scalar users outside
3943 // replicate regions. Recipes inside replicate regions still extract the
3944 // required lanes implicitly.
3945 // TODO: Remove once replicate regions are unrolled completely.
3946 auto IsCandidateUnpackUser = [Def](VPUser *U) {
3947 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
3948 return U->usesScalars(Def) &&
3949 (!ParentRegion || !ParentRegion->isReplicator());
3950 };
3951 if (none_of(Def->users(), IsCandidateUnpackUser))
3952 continue;
3953
3954 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
3955 if (R.isPhi())
3956 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
3957 else
3958 Unpack->insertAfter(&R);
3959 Def->replaceUsesWithIf(Unpack,
3960 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
3961 return IsCandidateUnpackUser(&U);
3962 });
3963 }
3964 }
3965 }
3966}
3967
3968void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
3969                                                 VPBasicBlock *VectorPHVPBB,
3970 bool TailByMasking,
3971 bool RequiresScalarEpilogue) {
3972 VPValue &VectorTC = Plan.getVectorTripCount();
3973 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
3974 // There's nothing to do if there are no users of the vector trip count or its
3975 // IR value has already been set.
3976 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
3977 return;
3978
3979 VPValue *TC = Plan.getTripCount();
3980 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
3981 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
3982 VPValue *Step = &Plan.getVFxUF();
3983
3984 // If the tail is to be folded by masking, round the number of iterations N
3985 // up to a multiple of Step instead of rounding down. This is done by first
3986 // adding Step-1 and then rounding down. Note that it's ok if this addition
3987 // overflows: the vector induction variable will eventually wrap to zero given
3988 // that it starts at zero and its Step is a power of two; the loop will then
3989 // exit, with the last early-exit vector comparison also producing all-true.
3990 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
3991 // is accounted for in emitIterationCountCheck that adds an overflow check.
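  // For example, with Step = 8 and an original trip count of 17, the rounded
  // count is 17 + 7 = 24, so the vector trip count computed below is
  // 24 - (24 % 8) = 24.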
3992 if (TailByMasking) {
3993 TC = Builder.createNaryOp(
3994 Instruction::Add,
3995 {TC, Builder.createNaryOp(Instruction::Sub,
3996 {Step, Plan.getConstantInt(TCTy, 1)})},
3997 DebugLoc::getCompilerGenerated(), "n.rnd.up");
3998 }
3999
4000 // Now we need to generate the expression for the part of the loop that the
4001 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4002 // iterations are not required for correctness, or N - Step, otherwise. Step
4003 // is equal to the vectorization factor (number of SIMD elements) times the
4004 // unroll factor (number of SIMD instructions).
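  // For example, N = 17 with Step = 8 yields n.mod.vf = 1 and n.vec = 16.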
4005 VPValue *R =
4006 Builder.createNaryOp(Instruction::URem, {TC, Step},
4007 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4008
4009 // There are cases where we *must* run at least one iteration in the remainder
4010 // loop. See the cost model for when this can happen. If the step evenly
4011 // divides the trip count, we set the remainder to be equal to the step. If
4012 // the step does not evenly divide the trip count, no adjustment is necessary
4013 // since there will already be scalar iterations. Note that the minimum
4014 // iterations check ensures that N >= Step.
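  // For example, N = 16 with Step = 8 yields n.mod.vf = 0, which is bumped to
  // 8 here so that n.vec = 8 and the remaining 8 iterations run in the scalar
  // epilogue.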
4015 if (RequiresScalarEpilogue) {
4016 assert(!TailByMasking &&
4017           "requiring scalar epilogue is not supported with tail folding");
4018 VPValue *IsZero =
4019 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4020 R = Builder.createSelect(IsZero, Step, R);
4021 }
4022
4023 VPValue *Res = Builder.createNaryOp(
4024 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4025 VectorTC.replaceAllUsesWith(Res);
4026}
4027
4029 ElementCount VFEC) {
4030 VPBuilder Builder(VectorPH, VectorPH->begin());
4031 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4032 VPValue &VF = Plan.getVF();
4033 VPValue &VFxUF = Plan.getVFxUF();
4034 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4035 // used.
4036 // TODO: Assert that they aren't used.
4037
4038 // If there are no users of the runtime VF, compute VFxUF by constant folding
4039 // the multiplication of VF and UF.
4040 if (VF.getNumUsers() == 0) {
4041 VPValue *RuntimeVFxUF =
4042 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4043 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4044 return;
4045 }
4046
4047 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4048 // vscale) * UF.
4049 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4051 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4052    VF.replaceUsesWithIf(
4053        BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4054 }
4055 VF.replaceAllUsesWith(RuntimeVF);
4056
4057 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4058 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4059 VFxUF.replaceAllUsesWith(MulByUF);
4060}
4061
4064 const DataLayout &DL = SE.getDataLayout();
4065 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4066
4067 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4068 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4069 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4070 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4072 continue;
4073 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4074 if (!ExpSCEV)
4075 break;
4076 const SCEV *Expr = ExpSCEV->getSCEV();
4077 Value *Res =
4078 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4079 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4080 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4081 ExpSCEV->replaceAllUsesWith(Exp);
4082 if (Plan.getTripCount() == ExpSCEV)
4083 Plan.resetTripCount(Exp);
4084 ExpSCEV->eraseFromParent();
4085 }
4087 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4088 "after any VPIRInstructions");
4089 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4090 // to the VPIRBasicBlock.
4091 auto EI = Entry->begin();
4092 for (Instruction &I : drop_end(*EntryBB)) {
4093 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4094 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4095 EI++;
4096 continue;
4097 }
4099 }
4100
4101 return ExpandedSCEVs;
4102}
4103
4104/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe result
4105/// that can be converted to a narrower recipe. \p OpV is used by a wide recipe
4106/// that feeds a store interleave group at index \p Idx; \p WideMember0 is the
4107/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
4108/// be narrowed to an index-independent load if it feeds all wide ops at all
4109/// indices (\p OpV must be the operand at index \p OpIdx for both the current
4110/// recipe and \p WideMember0). A VPInterleaveRecipe can be narrowed to a wide
4111/// load if \p OpV is defined at \p Idx of a full load interleave group.
4112static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4113 VPValue *OpV, unsigned Idx) {
4114 auto *DefR = OpV->getDefiningRecipe();
4115 if (!DefR)
4116 return WideMember0->getOperand(OpIdx) == OpV;
4117 if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
4118 return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
4119
4120 if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
4121 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4122 return false;
4123}
4124
4125/// Returns true if \p InterleaveR is a full interleave group with factor and
4126/// number of members both equal to \p VF. The interleave group must also access
4127/// the full vector width \p VectorRegWidth.
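/// For example, with 128-bit vector registers and i32 members, this holds for
/// a fully-populated factor-4 group at VF = 4 (4 x 32 bits = 128 bits).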
4128static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4129                                         unsigned VF, VPTypeAnalysis &TypeInfo,
4130 unsigned VectorRegWidth) {
4131 if (!InterleaveR || InterleaveR->getMask())
4132 return false;
4133
4134 Type *GroupElementTy = nullptr;
4135 if (InterleaveR->getStoredValues().empty()) {
4136 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4137 if (!all_of(InterleaveR->definedValues(),
4138 [&TypeInfo, GroupElementTy](VPValue *Op) {
4139 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4140 }))
4141 return false;
4142 } else {
4143 GroupElementTy =
4144 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4145 if (!all_of(InterleaveR->getStoredValues(),
4146 [&TypeInfo, GroupElementTy](VPValue *Op) {
4147 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4148 }))
4149 return false;
4150 }
4151
4152 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
4153 auto IG = InterleaveR->getInterleaveGroup();
4154 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
4155 GroupSize == VectorRegWidth;
4156}
4157
4158/// Returns true if \p VPValue is a narrow VPValue.
4159static bool isAlreadyNarrow(VPValue *VPV) {
4160 if (VPV->isLiveIn())
4161 return true;
4162 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4163 return RepR && RepR->isSingleScalar();
4164}
4165
4166void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
4167                                             unsigned VectorRegWidth) {
4168 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4169 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4170 return;
4171
4172 VPTypeAnalysis TypeInfo(Plan);
4173
4174 unsigned VFMinVal = VF.getKnownMinValue();
4175  SmallVector<VPInterleaveRecipe *> StoreGroups;
4176  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4178 continue;
4179
4182 continue;
4183
4184 // Bail out on recipes not supported at the moment:
4185 // * phi recipes other than the canonical induction
4186 // * recipes writing to memory except interleave groups
4187 // Only support plans with a canonical induction phi.
4188 if (R.isPhi())
4189 return;
4190
4191 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4192 if (R.mayWriteToMemory() && !InterleaveR)
4193 return;
4194
4195 // Do not narrow interleave groups if there are VectorPointer recipes and
4196 // the plan was unrolled. The recipe implicitly uses VF from
4197 // VPTransformState.
4198 // TODO: Remove restriction once the VF for the VectorPointer offset is
4199 // modeled explicitly as operand.
4200 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4201 return;
4202
4203 // All other ops are allowed, but we reject uses that cannot be converted
4204 // when checking all allowed consumers (store interleave groups) below.
4205 if (!InterleaveR)
4206 continue;
4207
4208 // Bail out on non-consecutive interleave groups.
4209 if (!isConsecutiveInterleaveGroup(InterleaveR, VFMinVal, TypeInfo,
4210 VectorRegWidth))
4211 return;
4212
4213 // Skip read interleave groups.
4214 if (InterleaveR->getStoredValues().empty())
4215 continue;
4216
4217 // Narrow interleave groups, if all operands are already matching narrow
4218 // ops.
4219 auto *Member0 = InterleaveR->getStoredValues()[0];
4220 if (isAlreadyNarrow(Member0) &&
4221 all_of(InterleaveR->getStoredValues(),
4222 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4223 StoreGroups.push_back(InterleaveR);
4224 continue;
4225 }
4226
4227 // For now, we only support full interleave groups storing load interleave
4228 // groups.
4229 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4230 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4231 if (!DefR)
4232 return false;
4233 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4234 return IR && IR->getInterleaveGroup()->isFull() &&
4235 IR->getVPValue(Op.index()) == Op.value();
4236 })) {
4237 StoreGroups.push_back(InterleaveR);
4238 continue;
4239 }
4240
4241    // Check if all values feeding InterleaveR are matching wide recipes whose
4242    // operands can be narrowed.
4243 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
4244 InterleaveR->getStoredValues()[0]->getDefiningRecipe());
4245 if (!WideMember0)
4246 return;
4247 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4248 auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
4249 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4250 R->getNumOperands() > 2)
4251 return;
4252 if (any_of(enumerate(R->operands()),
4253 [WideMember0, Idx = I](const auto &P) {
4254 const auto &[OpIdx, OpV] = P;
4255 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4256 }))
4257 return;
4258 }
4259 StoreGroups.push_back(InterleaveR);
4260 }
4261
4262 if (StoreGroups.empty())
4263 return;
4264
4265 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4266 SmallPtrSet<VPValue *, 4> NarrowedOps;
4267 auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
4268 auto *R = V->getDefiningRecipe();
4269 if (!R || NarrowedOps.contains(V))
4270 return V;
4271 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4272 // Narrow interleave group to wide load, as transformed VPlan will only
4273 // process one original iteration.
4274 auto *LI =
4275 cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4276 auto *L = new VPWidenLoadRecipe(
4277 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4278 /*Reverse=*/false, LI->getAlign(), {}, LoadGroup->getDebugLoc());
4279 L->insertBefore(LoadGroup);
4280 NarrowedOps.insert(L);
4281 return L;
4282 }
4283
4284 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4285 assert(RepR->isSingleScalar() &&
4286 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4287 "must be a single scalar load");
4288 NarrowedOps.insert(RepR);
4289 return RepR;
4290 }
4291 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4292
4293 VPValue *PtrOp = WideLoad->getAddr();
4294 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4295 PtrOp = VecPtr->getOperand(0);
4296 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4297 // process one original iteration.
4298 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4299 /*IsUniform*/ true,
4300 /*Mask*/ nullptr, *WideLoad);
4301 N->insertBefore(WideLoad);
4302 NarrowedOps.insert(N);
4303 return N;
4304 };
4305
4306 // Narrow operation tree rooted at store groups.
4307 for (auto *StoreGroup : StoreGroups) {
4308 VPValue *Res = nullptr;
4309 VPValue *Member0 = StoreGroup->getStoredValues()[0];
4310 if (isAlreadyNarrow(Member0)) {
4311 Res = Member0;
4312 } else if (auto *WideMember0 =
4314 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4315 WideMember0->setOperand(Idx, NarrowOp(WideMember0->getOperand(Idx)));
4316 Res = WideMember0;
4317 } else {
4318 Res = NarrowOp(Member0);
4319 }
4320
4321 auto *SI =
4322 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4323 auto *S = new VPWidenStoreRecipe(
4324 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4325 /*Reverse=*/false, SI->getAlign(), {}, StoreGroup->getDebugLoc());
4326 S->insertBefore(StoreGroup);
4327 StoreGroup->eraseFromParent();
4328 }
4329
4330 // Adjust induction to reflect that the transformed plan only processes one
4331 // original iteration.
4332 auto *CanIV = VectorLoop->getCanonicalIV();
4333 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4334 VPBuilder PHBuilder(Plan.getVectorPreheader());
4335
4336 VPValue *UF = Plan.getOrAddLiveIn(
4337 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4338 if (VF.isScalable()) {
4339 VPValue *VScale = PHBuilder.createElementCount(
4341 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4342 Inc->setOperand(1, VScaleUF);
4343 Plan.getVF().replaceAllUsesWith(VScale);
4344 } else {
4345 Inc->setOperand(1, UF);
4346    Plan.getVF().replaceAllUsesWith(
4347        Plan.getConstantInt(CanIV->getScalarType(), 1));
4348 }
4349 removeDeadRecipes(Plan);
4350}
4351
4352/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4353/// BranchOnCond recipe.
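/// For example, with VF = 4 and UF = 2 (and no vscale tuning), the assumed
/// vector step is 8 and the emitted branch weights are {1, 7}.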
4354void VPlanTransforms::addBranchWeightToMiddleTerminator(
4355    VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4356 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4357 auto *MiddleTerm =
4358      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
4359  // Only add branch metadata if there is a (conditional) terminator.
4360 if (!MiddleTerm)
4361 return;
4362
4363 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4364 "must have a BranchOnCond");
4365  // Assume that `TripCount % VectorStep` is equally distributed.
4366 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4367 if (VF.isScalable() && VScaleForTuning.has_value())
4368 VectorStep *= *VScaleForTuning;
4369 assert(VectorStep > 0 && "trip count should not be zero");
4370 MDBuilder MDB(Plan.getContext());
4371 MDNode *BranchWeights =
4372 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4373 MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
4374}
4375
4376/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
4377/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
4378/// the end value of the induction.
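/// For a non-canonical integer induction, the end value computed by the
/// created VPDerivedIVRecipe amounts to Start + VectorTC * Step.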
4379static VPInstruction *addResumePhiRecipeForInduction(
4380    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
4381 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
4382 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4383 // Truncated wide inductions resume from the last lane of their vector value
4384 // in the last vector iteration which is handled elsewhere.
4385 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4386 return nullptr;
4387
4388 VPValue *Start = WideIV->getStartValue();
4389 VPValue *Step = WideIV->getStepValue();
4390  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
4391  VPValue *EndValue = VectorTC;
4392 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4393 EndValue = VectorPHBuilder.createDerivedIV(
4394 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4395 Start, VectorTC, Step);
4396 }
4397
4398 // EndValue is derived from the vector trip count (which has the same type as
4399 // the widest induction) and thus may be wider than the induction here.
4400 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4401 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4402 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4403 ScalarTypeOfWideIV,
4404 WideIV->getDebugLoc());
4405 }
4406
4407 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
4408 {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
4409 return ResumePhiRecipe;
4410}
4411
4412void VPlanTransforms::addScalarResumePhis(
4413    VPlan &Plan, VPRecipeBuilder &Builder,
4414 DenseMap<VPValue *, VPValue *> &IVEndValues) {
4415 VPTypeAnalysis TypeInfo(Plan);
4416 auto *ScalarPH = Plan.getScalarPreheader();
4417 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4418 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4419 VPBuilder VectorPHBuilder(
4420 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4421 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4422 VPBuilder ScalarPHBuilder(ScalarPH);
4423 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
4424 auto *ScalarPhiIRI = cast<VPIRPhi>(&ScalarPhiR);
4425
4426 // TODO: Extract final value from induction recipe initially, optimize to
4427 // pre-computed end value together in optimizeInductionExitUsers.
4428 auto *VectorPhiR =
4429 cast<VPHeaderPHIRecipe>(Builder.getRecipe(&ScalarPhiIRI->getIRPhi()));
4430 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4431      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
4432              WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
4433 &Plan.getVectorTripCount())) {
4434 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
4435 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
4436 ScalarPhiIRI->addOperand(ResumePhi);
4437 continue;
4438 }
4439 // TODO: Also handle truncated inductions here. Computing end-values
4440 // separately should be done as VPlan-to-VPlan optimization, after
4441 // legalizing all resume values to use the last lane from the loop.
4442 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4443 "should only skip truncated wide inductions");
4444 continue;
4445 }
4446
4447 // The backedge value provides the value to resume coming out of a loop,
4448 // which for FORs is a vector whose last element needs to be extracted. The
4449 // start value provides the value if the loop is bypassed.
4450 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4451 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4452 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4453 "Cannot handle loops with uncountable early exits");
4454 if (IsFOR)
4455 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4456 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4457 "vector.recur.extract");
4458 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
4459 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
4460 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
4461 ScalarPhiIRI->addOperand(ResumePhiR);
4462 }
4463}
4464
4466 VFRange &Range) {
4467 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4468 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4469 auto *MiddleVPBB = Plan.getMiddleBlock();
4470 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4471 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4472
4473 auto IsScalableOne = [](ElementCount VF) -> bool {
4474 return VF == ElementCount::getScalable(1);
4475 };
4476
4477 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4478 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4479 if (!FOR)
4480 continue;
4481
4482 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4483 "Cannot handle loops with uncountable early exits");
4484
4485 // This is the second phase of vectorizing first-order recurrences, creating
4486    // extracts for users outside the loop. An overview of the transformation is
4487 // described below. Suppose we have the following loop with some use after
4488 // the loop of the last a[i-1],
4489 //
4490 // for (int i = 0; i < n; ++i) {
4491 // t = a[i - 1];
4492 // b[i] = a[i] - t;
4493 // }
4494 // use t;
4495 //
4496 // There is a first-order recurrence on "a". For this loop, the shorthand
4497 // scalar IR looks like:
4498 //
4499 // scalar.ph:
4500 // s.init = a[-1]
4501 // br scalar.body
4502 //
4503 // scalar.body:
4504 // i = phi [0, scalar.ph], [i+1, scalar.body]
4505 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4506 // s2 = a[i]
4507 // b[i] = s2 - s1
4508 // br cond, scalar.body, exit.block
4509 //
4510 // exit.block:
4511 // use = lcssa.phi [s1, scalar.body]
4512 //
4513    // In this example, s1 is a recurrence because its value depends on the
4514 // previous iteration. In the first phase of vectorization, we created a
4515 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4516 // for users in the scalar preheader and exit block.
4517 //
4518 // vector.ph:
4519 // v_init = vector(..., ..., ..., a[-1])
4520 // br vector.body
4521 //
4522 // vector.body
4523 // i = phi [0, vector.ph], [i+4, vector.body]
4524 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4525 // v2 = a[i, i+1, i+2, i+3]
4526 // b[i] = v2 - v1
4527 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4528 // b[i, i+1, i+2, i+3] = v2 - v1
4529 // br cond, vector.body, middle.block
4530 //
4531 // middle.block:
4532 // vector.recur.extract.for.phi = v2(2)
4533 // vector.recur.extract = v2(3)
4534 // br cond, scalar.ph, exit.block
4535 //
4536 // scalar.ph:
4537 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4538 // [s.init, otherwise]
4539 // br scalar.body
4540 //
4541 // scalar.body:
4542 // i = phi [0, scalar.ph], [i+1, scalar.body]
4543 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4544 // s2 = a[i]
4545 // b[i] = s2 - s1
4546 // br cond, scalar.body, exit.block
4547 //
4548 // exit.block:
4549 // lo = lcssa.phi [s1, scalar.body],
4550 // [vector.recur.extract.for.phi, middle.block]
4551 //
4552 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4553 // Extract the penultimate value of the recurrence and use it as operand for
4554 // the VPIRInstruction modeling the phi.
4555 for (VPUser *U : FOR->users()) {
4556 using namespace llvm::VPlanPatternMatch;
4557 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4558 continue;
4559 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4560 // penultimate value of the recurrence. Instead we rely on the existing
4561 // extract of the last element from the result of
4562 // VPInstruction::FirstOrderRecurrenceSplice.
4563 // TODO: Consider vscale_range info and UF.
4564      if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
4565                                                             Range))
4566 return;
4567 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4568 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4569 {}, "vector.recur.extract.for.phi");
4570 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4571 }
4572 }
4573}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:389
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:58
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common and'ed-in value and, if so, remove it from the masks...
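As a rough illustration of why stripping a common and'ed-in value is sound, the toy model below treats a normalized blend as a chain of overriding selects and checks that, whenever the common value C holds (the only situation in which the blend is reached), masks of the form (C && M_i) and plain M_i pick the same incoming value. The blend() helper is purely hypothetical, not a VPlan API.

// Toy model of a normalized blend: the first incoming value is the default,
// later incomings override it when their mask is set.
#include <cassert>

int blend(int Default, int V1, bool M1, int V2, bool M2) {
  int R = Default;
  if (M1) R = V1;
  if (M2) R = V2;
  return R;
}

int main() {
  for (int Bits = 0; Bits < 4; ++Bits) {
    bool M1 = Bits & 1, M2 = Bits & 2;
    const bool C = true; // the blend's block is only reached when C holds
    assert(blend(7, 1, C && M1, 2, C && M2) == blend(7, 1, M1, 2, M2));
  }
  return 0;
}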
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
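A minimal sketch of the arithmetic behind simplifyBranchConditionForVFAndUF, assuming the usual BranchOnCount latch: if the trip count is no larger than VF * UF, the vector loop exits after its first iteration, so the conditional branch can be folded to an unconditional one. The helper name below is made up and this is not the VPlan code.

#include <cassert>
#include <cstdint>

bool latchExitsOnFirstIteration(uint64_t TripCount, unsigned VF, unsigned UF) {
  // Conceptually, BranchOnCount compares the canonical IV incremented by
  // VF * UF against the trip count and exits once the increment reaches it.
  return TripCount <= uint64_t(VF) * UF;
}

int main() {
  assert(latchExitsOnFirstIteration(16, 4, 4));   // 16 <= 4*4: single iteration
  assert(!latchExitsOnFirstIteration(17, 4, 4));  // needs a second iteration
  return 0;
}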
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo)
Try to simplify recipe R.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant ExpandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, unsigned VF, VPTypeAnalysis &TypeInfo, unsigned VectorRegWidth)
Returns true if InterleaveR is a full interleave group with factor and number of members both equal to VF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static cl::opt< bool > EnableWideActiveLaneMask("enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions"))
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPV is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
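The width optimization rests on a simple range check: an induction counting up to the trip count only needs as many bits as the trip count itself. Below is a standalone sketch of that check with an illustrative helper; the real transform also folds in VF and UF and rewrites the affected recipes.

#include <cassert>
#include <cstdint>

unsigned minimalIVWidth(uint64_t TripCount) {
  unsigned Bits = 1;
  while (Bits < 64 && (TripCount >> Bits) != 0)
    ++Bits;
  return Bits; // smallest width whose unsigned range contains TripCount
}

int main() {
  assert(minimalIVWidth(255) == 8);
  assert(minimalIVWidth(256) == 9);   // a real narrowing would round this up
  assert(minimalIVWidth(1000) == 10); // fits comfortably in i16 or i32
  return 0;
}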
VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
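A standalone model of the pattern findHeaderMask looks for: lane L of the wide canonical IV is compared ULE against the backedge-taken count, so a lane is active exactly when it corresponds to a real iteration. The headerMask() function below is illustrative only.

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<bool> headerMask(uint64_t IV, uint64_t BackedgeTakenCount,
                             unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = (IV + L) <= BackedgeTakenCount; // ICMP_ULE per lane
  return Mask;
}

int main() {
  // Trip count 6 => backedge-taken count 5. The second vector iteration
  // (IV = 4) with VF = 4 has two active lanes and two tail lanes masked off.
  std::vector<bool> M = headerMask(4, 5, 4);
  assert(M[0] && M[1] && !M[2] && !M[3]);
  return 0;
}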
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is a VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1557
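A loose standalone model of getDecisionAndClampRange's documented behaviour, not the LLVM implementation: evaluate the predicate at the start of a power-of-two VF range and clamp the range's end to the prefix that agrees with that first decision. SimpleVFRange and decideAndClamp are hypothetical stand-ins.

#include <cassert>
#include <functional>

struct SimpleVFRange {       // hypothetical stand-in for a VF range
  unsigned Start, End;       // [Start, End), powers of two
};

bool decideAndClamp(const std::function<bool(unsigned)> &Pred,
                    SimpleVFRange &R) {
  bool Decision = Pred(R.Start);
  for (unsigned VF = R.Start * 2; VF < R.End; VF *= 2)
    if (Pred(VF) != Decision) {
      R.End = VF;            // clamp: later VFs would decide differently
      break;
    }
  return Decision;
}

int main() {
  SimpleVFRange R{2, 16};
  bool D = decideAndClamp([](unsigned VF) { return VF <= 4; }, R);
  assert(D && R.End == 8);   // VF = 8 flips the decision, so the range stops there
  return 0;
}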
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3536
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3823
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3898
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3850
iterator end()
Definition VPlan.h:3860
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3858
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3911
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:220
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:593
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:565
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:638
const VPRecipeBase & back() const
Definition VPlan.h:3872
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2414
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2448
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2438
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2454
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2434
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:190
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:170
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:187
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:208
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:127
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:146
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:165
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:2958
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3479
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3644
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3567
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3003
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1989
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2037
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2026
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3976
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4000
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for I, if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:979
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1063
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1014
@ FirstOrderRecurrenceSplice
Definition VPlan.h:985
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1009
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1006
@ CanonicalIVIncrementForPart
Definition VPlan.h:999
@ CalculateTripCountMinusVF
Definition VPlan.h:997
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2555
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2547
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2576
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2628
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2587
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3144
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadOrWriteMemory() const
Returns true if the recipe may read from or write to memory.
Definition VPlan.h:473
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPRegionBlock * getRegion()
Definition VPlan.h:4128
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2836
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2677
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4011
const VPBlockBase * getEntry() const
Definition VPlan.h:4047
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4122
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4079
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4064
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4109
const VPBlockBase * getExiting() const
Definition VPlan.h:4059
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4072
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2880
bool isSingleScalar() const
Definition VPlan.h:2925
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:2949
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3713
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:186
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1393
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1397
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1853
VPVectorEndPointerRecipe * clone() override
Clone the current recipe.
Definition VPlan.h:1897
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3608
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1494
Instruction::CastOps getOpcode() const
Definition VPlan.h:1545
A recipe for handling GEP instructions.
Definition VPlan.h:1781
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2054
PHINode * getPHINode() const
Definition VPlan.h:2096
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2082
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2099
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2129
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2210
A recipe for widening vector intrinsics.
Definition VPlan.h:1552
A common base class for widening memory operations.
Definition VPlan.h:3186
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3251
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3244
A recipe for widened phis.
Definition VPlan.h:2265
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1451
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4141
bool hasVF(ElementCount VF) const
Definition VPlan.h:4347
LLVMContext & getContext() const
Definition VPlan.h:4335
VPBasicBlock * getEntry()
Definition VPlan.h:4235
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4326
bool hasScalableVF() const
Definition VPlan.h:4348
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4333
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4329
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4297
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4404
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4318
unsigned getUF() const
Definition VPlan.h:4367
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4481
bool hasUF(unsigned UF) const
Definition VPlan.h:4365
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4287
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4410
void setVF(ElementCount VF)
Definition VPlan.h:4341
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4380
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4311
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4260
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4459
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4407
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4389
bool hasScalarVFOnly() const
Definition VPlan.h:4358
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4278
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4429
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4283
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4426
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4240
void setUF(unsigned UF)
Definition VPlan.h:4372
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4513
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:257
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
initializer< Ty > init(const Ty &Val)
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1734
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:830
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2307
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3314
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3274
A recipe for widening select instructions.
Definition VPlan.h:1735
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3396
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3354
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
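Conceptually, the explicit-vector-length (EVL) scheme added here replaces the fixed VF step with a per-iteration "how many lanes this time" value, which also covers the tail. A minimal sketch of that stepping, in plain C++ with no VP intrinsics and illustrative names only:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t TripCount = 10, VF = 4;
  uint64_t IV = 0, Processed = 0, Iterations = 0;
  while (IV < TripCount) {
    uint64_t EVL = std::min<uint64_t>(VF, TripCount - IV); // explicit length
    Processed += EVL;  // a real loop would apply VP intrinsics to EVL lanes
    IV += EVL;         // variable-length step instead of a fixed VF step
    ++Iterations;
  }
  assert(Processed == TripCount && Iterations == 3); // 4 + 4 + 2 lanes
  return 0;
}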
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
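The minimal-bitwidth idea behind truncateToMinimalBitwidths, in standalone form: when only the low bits of a result are demanded, the wide operation can be performed in a narrower type, with truncates on the inputs and an extend on the result. Purely illustrative, not the transform itself:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t A = 0x12345678, B = 0x000000FF;
  // Only the low 8 bits of the result are ever used downstream.
  uint8_t NarrowA = static_cast<uint8_t>(A);   // trunc i32 -> i8
  uint8_t NarrowB = static_cast<uint8_t>(B);   // trunc i32 -> i8
  uint32_t Widened = static_cast<uint32_t>(uint8_t(NarrowA & NarrowB)); // zext back
  assert(Widened == ((A & B) & 0xFFu));        // same low-bit result
  return 0;
}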
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.