//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;
using namespace VPlanPatternMatch;

bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlan &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside the region.
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      if (!VPV->getUnderlyingValue())
        continue;

      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
      VPRecipeBase *NewRecipe = nullptr;
      if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
        auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II) {
          NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
          for (VPValue *Op : PhiR->operands())
            NewRecipe->addOperand(Op);
        } else {
          VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
          VPValue *Step =
              vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
          // It is always safe to copy over the NoWrap and FastMath flags. In
          // particular, when folding tail by masking, the masked-off lanes are
          // never used, so it is safe.
          VPIRFlags Flags = vputils::getFlagsFromIndDesc(*II);
          NewRecipe = new VPWidenIntOrFpInductionRecipe(
              Phi, Start, Step, &Plan.getVF(), *II, Flags,
              Ingredient.getDebugLoc());
        }
      } else {
        auto *VPI = cast<VPInstruction>(&Ingredient);
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/, *VPI,
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
              Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
                                           Ingredient.getDebugLoc());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
          if (VectorID == Intrinsic::not_intrinsic)
            return false;
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
              *VPI, CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
                                              *VPI, Ingredient.getDebugLoc());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
              VPIRFlags(*CI), VPIRMetadata(*CI));
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
                                        *VPI, Ingredient.getDebugLoc());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
  return true;
}

/// Return true if we do not know how to (mechanically) hoist or sink \p R out
/// of a loop region.
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
  // Assumes don't alias anything or throw; as long as they're guaranteed to
  // execute, they're safe to hoist.
  if (match(&R, m_Intrinsic<Intrinsic::assume>()))
    return false;

  // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
  // memory location is not modified in the vector loop.
  if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
    return true;

  // Allocas cannot be hoisted.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  return RepR && RepR->getOpcode() == Instruction::Alloca;
}

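/// Sink single-def recipes that are only needed inside replicate blocks into
/// those blocks, duplicating a candidate when users outside the target block
/// only demand its first lane. Returns true if any recipe was sunk.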
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  bool Changed = false;

  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
                                        VPBasicBlock *SinkTo, VPValue *Op) {
    auto *Candidate =
        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
    if (!Candidate)
      return;

    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
    // for now.
    if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
      return;

    if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
      return;

    if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
      if (!ScalarVFOnly && RepR->isSingleScalar())
        return;

    WorkList.insert({SinkTo, Candidate});
  };

  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
    if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB)
      for (VPValue *Op : Recipe.operands())
        InsertIfValidSinkCandidate(VPBB, Op);
  }

  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];

    // All recipe users of SinkCandidate must be in the same block SinkTo or
    // all users outside of SinkTo must only use the first lane of
    // SinkCandidate. In the latter case, we need to duplicate SinkCandidate.
    auto UsersOutsideSinkTo =
        make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
          return cast<VPRecipeBase>(U)->getParent() != SinkTo;
        });
    if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
          return !U->usesFirstLaneOnly(SinkCandidate);
        }))
      continue;
    bool NeedsDuplicating = !UsersOutsideSinkTo.empty();

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR,
                                      *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      InsertIfValidSinkCandidate(SinkTo, Op);
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
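///
/// For illustration, the triangle shape looked for here is (block names are
/// for exposition only):
///
///        entry
///        /   \
///     then    |
///        \   /
///       exiting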
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions in their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
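//
// For illustration (block names are made up), two replicate regions guarded
// by the same mask %m,
//
//   region1: { entry (branch-on-mask %m), then1, merge1 }
//     -> empty VPBB ->
//   region2: { entry (branch-on-mask %m), then2, merge2 }
//
// are merged by moving the recipes of then1 into then2 and the phis of merge1
// into merge2, after which region1 is disconnected and removed.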
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks to process front. This is to avoid
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}

static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace the predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
      PredRecipe->getDebugLoc());
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createReplicateRegion(Entry, Exiting, RegionName);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
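///
/// For illustration (names made up), given a scalar loop with
///   %iv     = phi i32 ...
///   %iv.ext = sext i32 %iv to i64
/// where the induction descriptor already records %iv.ext as a redundant cast
/// of %iv, all users of the cast recipe are simply redirected to the widened
/// induction itself.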
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      // We are replacing a wide canonical IV with a suitable wide induction.
      // This is used to compute the header mask, hence all lanes will be used
      // and we need to drop wrap flags only applying to lanes guaranteed to
      // execute in the original scalar loop.
      WidenOriginalIV->dropPoisonGeneratingFlags();
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume = RepR && RepR->isPredicated() &&
                             match(RepR, m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_post_order_deep(Plan.getEntry()))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R)) {
        R.eraseFromParent();
        continue;
      }

      // Check if R is a dead VPPhi <-> update cycle and remove it.
      auto *PhiR = dyn_cast<VPPhi>(&R);
      if (!PhiR || PhiR->getNumOperands() != 2)
        continue;
      VPUser *PhiUser = PhiR->getSingleUser();
      if (!PhiUser)
        continue;
      VPValue *Incoming = PhiR->getOperand(1);
      if (PhiUser != Incoming->getDefiningRecipe() ||
          Incoming->getNumUsers() != 1)
        continue;
      PhiR->replaceAllUsesWith(PhiR->getOperand(0));
      PhiR->eraseFromParent();
      Incoming->getDefiningRecipe()->eraseFromParent();
    }
  }
}

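/// Create a chain of recipes producing per-lane scalar steps for an induction
/// with the given \p Kind and \p InductionOpcode: a derived IV based on the
/// canonical IV, \p StartV and \p Step, optionally truncated to the type of
/// \p TruncI, followed by a VPScalarIVStepsRecipe.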
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  VPTypeAnalysis TypeInfo(Plan);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader = Plan.getVectorPreheader();
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}

static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(llvm::from_range, V->users());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert_range(V->users());
  }
  return Users.takeVector();
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is different
/// from how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the need
/// for vector extracts.
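///
/// For illustration (VPlan syntax simplified), a pointer induction whose
/// values are only used as scalar addresses,
///
///   WIDEN-POINTER-INDUCTION %ptr.iv = phi %start, %step
///
/// is rewritten to
///
///   %steps    = SCALAR-STEPS ir<0>, %step
///   %next.gep = ptradd %start, %steps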
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true,
                                          /*Mask*/ nullptr, /*Flags*/ *Def);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
      VPValue *StepV = PtrIV->getOperand(1);
      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);

      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                             PtrIV->getDebugLoc(), "next.gep");

      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Check if \p VPV is an untruncated wide induction, either before or after
/// the increment. If so, return the header IV (before the increment);
/// otherwise return null.
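///
/// For illustration (simplified VPlan syntax), for
///   %iv      = WIDEN-INDUCTION %start, %step
///   %iv.next = add %iv, %step
/// both %iv and %iv.next are considered optimizable and %iv is returned.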
static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV,
                                                  ScalarEvolution &SE) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for
    // exit users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  auto IsWideIVInc = [&]() {
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
    case Instruction::FAdd:
      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
                                                      m_Specific(IVStep)));
    case Instruction::FSub:
      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                    m_Specific(IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step ==
      // -1 * IVStep.
      VPValue *Step;
      if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
        return false;
      const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, SE);
      const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, SE);
      return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
             !isa<SCEVCouldNotCompute>(StepSCEV) &&
             IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
    }
    default:
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                        m_Specific(WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}

/// Attempts to optimize the induction variable exit values for users in the
/// early exit block.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op,
                                               ScalarEvolution &SE) {
  VPValue *Incoming, *Mask;
  if (!match(Op, m_ExtractLane(m_FirstActiveLane(m_VPValue(Mask)),
                               m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming, SE);
  if (!WideIV)
    return nullptr;

  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto *CanonicalIV = LoopRegion->getCanonicalIV();
  Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
  VPBuilder B(cast<VPBasicBlock>(PredVPBB));

  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
  FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
                                              FirstActiveLaneType, DL);
  VPValue *EndValue =
      B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
  }

  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, EndValue, Step);
  }

  return EndValue;
}

/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
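///
/// As a worked example (names made up), for an induction starting at S with
/// step D and a pre-computed end value E = S + TC * D: an exit user of the
/// incremented value receives E directly, while an exit user of the
/// pre-incremented value receives E - D.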
static VPValue *optimizeLatchExitInductionUser(
    VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
    DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
  VPValue *Incoming;
  if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming, SE);
  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(WideIV);
  assert(EndValue && "end value must have been pre-computed");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue.
  VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
  if (ScalarTy->isPointerTy()) {
    Type *StepTy = TypeInfo.inferScalarType(Step);
    auto *Zero = Plan.getConstantInt(StepTy, 0);
    return B.createPtrAdd(EndValue,
                          B.createNaryOp(Instruction::Sub, {Zero, Step}),
                          DebugLoc::getUnknown(), "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}

void VPlanTransforms::optimizeInductionExitUsers(
    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
    ScalarEvolution &SE) {
  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
  VPTypeAnalysis TypeInfo(Plan);
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);

      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx),
                                                  EndValues, SE);
        else
          Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx), SE);
        if (Escape)
          ExitIRI->setOperand(Idx, Escape);
      }
    }
  }
}

/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
    if (Inserted)
      continue;
    ExpR->replaceAllUsesWith(V->second);
    ExpR->eraseFromParent();
  }
}

static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    append_range(WorkList, R->operands());
    R->eraseFromParent();
  }
}

/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
/// Returns an optional pair, where the first element indicates whether it is
/// an intrinsic ID.
static std::optional<std::pair<bool, unsigned>>
getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
  return TypeSwitch<const VPSingleDefRecipe *,
                    std::optional<std::pair<bool, unsigned>>>(R)
      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
            VPReplicateRecipe>(
          [](auto *I) { return std::make_pair(false, I->getOpcode()); })
      .Case<VPWidenIntrinsicRecipe>([](auto *I) {
        return std::make_pair(true, I->getVectorIntrinsicID());
      })
      .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
        // For recipes that do not directly map to LLVM IR instructions,
        // assign opcodes after the last VPInstruction opcode (which is also
        // after the last IR Instruction opcode), based on the VPDefID.
        return std::make_pair(false,
                              VPInstruction::OpsEnd + 1 + I->getVPDefID());
      })
      .Default([](auto *) { return std::nullopt; });
}

/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
/// non-nullptr VPValue for a handled opcode or intrinsic ID if the
/// corresponding \p Operands are foldable live-ins.
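///
/// For example, a recipe computing `add i64 2, 3` whose operands are both
/// live-in constants is folded to the live-in constant 5.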
static VPValue *tryToFoldLiveIns(const VPSingleDefRecipe &R,
                                 ArrayRef<VPValue *> Operands,
                                 const DataLayout &DL,
                                 VPTypeAnalysis &TypeInfo) {
  auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
  if (!OpcodeOrIID)
    return nullptr;

  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!Op->isLiveIn() || !Op->getLiveInIRValue())
      return nullptr;
    Ops.push_back(Op->getLiveInIRValue());
  }

  auto FoldToIRValue = [&]() -> Value * {
    InstSimplifyFolder Folder(DL);
    if (OpcodeOrIID->first) {
      if (R.getNumOperands() != 2)
        return nullptr;
      unsigned ID = OpcodeOrIID->second;
      return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
                                        TypeInfo.inferScalarType(&R));
    }
    unsigned Opcode = OpcodeOrIID->second;
    if (Instruction::isBinaryOp(Opcode))
      return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
                              Ops[0], Ops[1]);
    if (Instruction::isCast(Opcode))
      return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
                             TypeInfo.inferScalarType(R.getVPSingleValue()));
    switch (Opcode) {
    case VPInstruction::LogicalAnd:
      return Folder.FoldSelect(Ops[0], Ops[1],
                               ConstantInt::getNullValue(Ops[1]->getType()));
    case VPInstruction::Not:
      return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
                              Constant::getAllOnesValue(Ops[0]->getType()));
    case Instruction::Select:
      return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
    case Instruction::ICmp:
    case Instruction::FCmp:
      return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
                            Ops[1]);
    case Instruction::GetElementPtr: {
      auto &RFlags = cast<VPRecipeWithIRFlags>(R);
      auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
      return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
                            drop_begin(Ops), RFlags.getGEPNoWrapFlags());
    }
    case VPInstruction::PtrAdd:
    case VPInstruction::WidePtrAdd:
      return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
                            Ops[0], Ops[1],
                            cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
    // An extract of a live-in is an extract of a broadcast, so return the
    // broadcasted element.
    case Instruction::ExtractElement:
      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
      return Ops[0];
    }
    return nullptr;
  };

  if (Value *V = FoldToIRValue())
    return R.getParent()->getPlan()->getOrAddLiveIn(V);
  return nullptr;
}

/// Try to simplify VPSingleDefRecipe \p Def.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
  VPlan *Plan = Def->getParent()->getPlan();

  // Simplification of live-in IR values for SingleDef recipes using
  // InstSimplifyFolder.
  const DataLayout &DL =
      Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
  if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
    return Def->replaceAllUsesWith(V);

  // Fold PredPHI LiveIn -> LiveIn.
  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
    VPValue *Op = PredPHI->getOperand(0);
    if (Op->isLiveIn())
      PredPHI->replaceAllUsesWith(Op);
  }

  VPBuilder Builder(Def);
  VPValue *A;
  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    Type *TruncTy = TypeInfo.inferScalarType(Def);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Def->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(Def))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
        unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
                                            TruncTy);
        if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has a distinct return type, used to retain the
          // legacy cost.
          Ext->setUnderlyingValue(UnderlyingExt);
        }
        Def->replaceAllUsesWith(Ext);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
        Def->replaceAllUsesWith(Trunc);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(*Plan);
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y, *Z;
  if (match(Def,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
    Def->replaceAllUsesWith(X);
    Def->eraseFromParent();
    return;
  }

  // x | 1 -> 1
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
    return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));

  // x | 0 -> x
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
    return Def->replaceAllUsesWith(X);

  // x & 0 -> 0
  if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
    return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));

  // x && false -> false
  if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  // (x && y) || (x && z) -> x && (y || z)
  if (match(Def, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                              m_LogicalAnd(m_Deferred(X), m_VPValue(Z)))) &&
      // Simplify only if one of the operands has one use to avoid creating an
      // extra recipe.
      (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
       !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
    return Def->replaceAllUsesWith(
        Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));

  // x && !x -> 0
  if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
    return Def->replaceAllUsesWith(Plan->getFalse());

  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
    return Def->replaceAllUsesWith(X);

  // select !c, x, y -> select c, y, x
  VPValue *C;
  if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
    Def->setOperand(0, C);
    Def->setOperand(1, Y);
    Def->setOperand(2, X);
    return;
  }

  // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
  // tail folding it is likely that x is a header mask and can be simplified
  // further.
  if (match(Def, m_LogicalAnd(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                              m_VPValue(Z))) &&
      X->hasMoreThanOneUniqueUser())
    return Def->replaceAllUsesWith(
        Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));

  if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
    return Def->replaceAllUsesWith(A);

  if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
    return Def->replaceAllUsesWith(
        Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));

  if (match(Def, m_Not(m_VPValue(A)))) {
    if (match(A, m_Not(m_VPValue(A))))
      return Def->replaceAllUsesWith(A);

    // Try to fold Not into compares by adjusting the predicate in-place.
    CmpPredicate Pred;
    if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
      auto *Cmp = cast<VPRecipeWithIRFlags>(A);
      if (all_of(Cmp->users(),
                 match_fn(m_CombineOr(
                     m_Not(m_Specific(Cmp)),
                     m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
        Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
        for (VPUser *U : to_vector(Cmp->users())) {
          auto *R = cast<VPSingleDefRecipe>(U);
          if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
            // select (cmp pred), x, y -> select (cmp inv_pred), y, x
            R->setOperand(1, Y);
            R->setOperand(2, X);
          } else {
            // not (cmp pred) -> cmp inv_pred
            assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
            R->replaceAllUsesWith(Cmp);
          }
        }
        // If Cmp doesn't have a debug location, use the one from the negation,
        // to preserve the location.
        if (!Cmp->getDebugLoc() && Def->getDebugLoc())
          Cmp->setDebugLoc(Def->getDebugLoc());
      }
    }
  }

  // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
  // any-of (fcmp uno %A, %B), ...
  if (match(Def, m_AnyOf())) {
    SmallVector<VPValue *> NewOps;
    VPRecipeBase *UnpairedCmp = nullptr;
    for (VPValue *Op : Def->operands()) {
      VPValue *X;
      if (Op->getNumUsers() > 1 ||
          !match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
                                   m_Deferred(X)))) {
        NewOps.push_back(Op);
      } else if (!UnpairedCmp) {
        UnpairedCmp = Op->getDefiningRecipe();
      } else {
        NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
                                            UnpairedCmp->getOperand(0), X));
        UnpairedCmp = nullptr;
      }
    }

    if (UnpairedCmp)
      NewOps.push_back(UnpairedCmp->getVPSingleValue());

    if (NewOps.size() < Def->getNumOperands()) {
      VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
      return Def->replaceAllUsesWith(NewAnyOf);
    }
  }

  // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
  // This is useful for fmax/fmin without fast-math flags, where we need to
  // check if any operand is NaN.
  if (match(Def, m_c_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
                                            m_Deferred(X)),
                              m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
                                            m_Deferred(Y))))) {
    VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
    return Def->replaceAllUsesWith(NewCmp);
  }

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
       match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
      TypeInfo.inferScalarType(Def->getOperand(1)) ==
          TypeInfo.inferScalarType(Def))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  // Fold WideIVStep (X, 1) -> X, truncating to the result type if necessary.
  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(m_VPValue(X),
                                                            m_One()))) {
    Type *WideStepTy = TypeInfo.inferScalarType(Def);
    if (TypeInfo.inferScalarType(X) != WideStepTy)
      X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
    Def->replaceAllUsesWith(X);
    return;
  }

  // For i1 vp.merges produced by AnyOf reductions:
  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                  m_VPValue(X), m_VPValue())) &&
      match(A, m_c_BinaryOr(m_Deferred(X), m_VPValue(Y))) &&
      TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
    Def->setOperand(1, Def->getOperand(0));
    Def->setOperand(0, Y);
    return;
  }

  if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
    if (Phi->getOperand(0) == Phi->getOperand(1))
      Phi->replaceAllUsesWith(Phi->getOperand(0));
    return;
  }

  // Look through ExtractLastElement (BuildVector ....).
  if (match(Def, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
                             m_ExtractLastLanePerPart(m_BuildVector())))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 1));
    return;
  }

  // Look through ExtractPenultimateElement (BuildVector ....).
  if (match(Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
                     m_BuildVector()))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(
        BuildVector->getOperand(BuildVector->getNumOperands() - 2));
    return;
  }

  uint64_t Idx;
  if (match(Def, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
    Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
    return;
  }

  if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
    Def->replaceAllUsesWith(
        Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
    return;
  }

  // Look through broadcast of single-scalar when used as select conditions; in
  // that case the scalar condition can be used directly.
  if (match(Def,
            m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) {
    assert(vputils::isSingleScalar(C) &&
           "broadcast operand must be single-scalar");
    Def->setOperand(0, C);
    return;
  }

  if (auto *Phi = dyn_cast<VPPhi>(Def)) {
    if (Phi->getNumOperands() == 1)
      Phi->replaceAllUsesWith(Phi->getOperand(0));
    return;
  }

  // Some simplifications can only be applied after unrolling. Perform them
  // below.
  if (!Plan->isUnrolled())
    return;

  // Hoist an invariant increment Y of a phi X, by having X start at Y.
  if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
      isa<VPPhi>(X)) {
    auto *Phi = cast<VPPhi>(X);
    if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
        Phi->getSingleUser() == Def) {
      Phi->setOperand(0, Y);
      Def->replaceAllUsesWith(Phi);
      return;
    }
  }

  // VPVectorPointer for part 0 can be replaced by its start pointer.
  if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Def)) {
    if (VecPtr->isFirstPart()) {
      VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
      return;
    }
  }

  // VPScalarIVSteps for part 0 can be replaced by its start value, if only
  // the first lane is demanded.
  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
      Steps->replaceAllUsesWith(Steps->getOperand(0));
      return;
    }
  }
  // Simplify redundant ReductionStartVector recipes after unrolling.
  VPValue *StartV;
  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
      return PhiR && PhiR->isInLoop();
    });
    return;
  }

  if (match(Def,
            m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
                        m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
    Def->replaceAllUsesWith(A);
    return;
  }

  if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
                             m_ExtractLastLanePerPart(m_VPValue(A)))) &&
      ((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
       (isa<VPReplicateRecipe>(A) &&
        cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
      all_of(A->users(),
             [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
    return Def->replaceAllUsesWith(A);
  }

  if (Plan->getUF() == 1 &&
      match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
    return Def->replaceAllUsesWith(
        Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
  }
}

void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Plan);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
        simplifyRecipe(Def, TypeInfo);
  }
}

void VPlanTransforms::narrowToSingleScalarRecipes(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (!isa<VPWidenRecipe, VPReplicateRecipe>(&R))
        continue;
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      auto *RepOrWidenR = cast<VPRecipeWithIRFlags>(&R);
      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
          vputils::isSingleScalar(RepR->getOperand(1))) {
        auto *Clone = new VPReplicateRecipe(
            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
            *RepR /*Metadata*/, RepR->getDebugLoc());
        Clone->insertBefore(RepOrWidenR);
        unsigned ExtractOpc =
            vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
                ? VPInstruction::ExtractLastElement
                : VPInstruction::ExtractLastLanePerPart;
        auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
        Ext->insertBefore(Clone);
        Clone->setOperand(0, Ext);
        RepR->eraseFromParent();
        continue;
      }

      // Skip recipes that aren't single scalars or don't have only their
      // scalar results used. In the latter case, we would introduce extra
      // broadcasts.
      if (!vputils::isSingleScalar(RepOrWidenR) ||
          !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
            if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
              // VPWidenStore doesn't have users, and stores are always
              // profitable to widen: hence, permitting address and mask
              // operands, and single-scalar stored values is an important leaf
              // condition. The assert must hold as we checked the RepOrWidenR
              // operand against vputils::isSingleScalar.
              assert(RepOrWidenR != Store->getStoredValue() ||
                     vputils::isSingleScalar(Store->getStoredValue()));
              (void)Store;
              return true;
            }

            if (auto *VPI = dyn_cast<VPInstruction>(U)) {
              unsigned Opcode = VPI->getOpcode();
              if (Opcode == VPInstruction::ExtractLastElement ||
                  Opcode == VPInstruction::ExtractLastLanePerPart ||
                  Opcode == VPInstruction::ExtractPenultimateElement)
                return true;
            }

            return U->usesScalars(RepOrWidenR);
          }))
        continue;

      auto *Clone = new VPReplicateRecipe(
          RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
          true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
      Clone->insertBefore(RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(Clone);
      if (isDeadRecipe(*RepOrWidenR))
        RepOrWidenR->eraseFromParent();
    }
  }
}

/// Try to see if all of \p Blend's masks share a common value that is
/// logically and'ed in, and remove that value from the masks.
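///
/// For illustration: if every incoming mask has the form (C && M_i) for the
/// same common edge mask C, the masks are replaced by the M_i operands.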
static void removeCommonBlendMask(VPBlendRecipe *Blend) {
  if (Blend->isNormalized())
    return;
  VPValue *CommonEdgeMask;
  if (!match(Blend->getMask(0),
             m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
    return;
  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
    if (!match(Blend->getMask(I),
               m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
      return;
  for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
    Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
}

/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *Blend = dyn_cast<VPBlendRecipe>(&R);
      if (!Blend)
        continue;

      removeCommonBlendMask(Blend);

      // Try to remove redundant blend recipes.
      SmallPtrSet<VPValue *, 4> UniqueValues;
      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
        UniqueValues.insert(Blend->getIncomingValue(0));
      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
        if (!match(Blend->getMask(I), m_False()))
          UniqueValues.insert(Blend->getIncomingValue(I));

      if (UniqueValues.size() == 1) {
        Blend->replaceAllUsesWith(*UniqueValues.begin());
        Blend->eraseFromParent();
        continue;
      }

      if (Blend->isNormalized())
        continue;

      // Normalize the blend so its first incoming value is used as the initial
      // value, with the others blended into it.

      unsigned StartIndex = 0;
      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        // If a value's mask is used only by the blend, then it can be
        // dead-coded.
        // TODO: Find the most expensive mask that can be dead-coded, or a mask
        // that's used by multiple blends where it can be removed from them
        // all.
        VPValue *Mask = Blend->getMask(I);
        if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
          StartIndex = I;
          break;
        }
      }

      SmallVector<VPValue *, 4> OperandsWithMask;
      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));

      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        if (I == StartIndex)
          continue;
        OperandsWithMask.push_back(Blend->getIncomingValue(I));
        OperandsWithMask.push_back(Blend->getMask(I));
      }

      auto *NewBlend =
          new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
                            OperandsWithMask, Blend->getDebugLoc());
      NewBlend->insertBefore(&R);

      VPValue *DeadMask = Blend->getMask(StartIndex);
      Blend->replaceAllUsesWith(NewBlend);
      Blend->eraseFromParent();
      recursivelyDeleteDeadRecipes(DeadMask);

      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
      VPValue *NewMask;
      if (NewBlend->getNumOperands() == 3 &&
          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
        VPValue *Inc0 = NewBlend->getOperand(0);
        VPValue *Inc1 = NewBlend->getOperand(1);
        VPValue *OldMask = NewBlend->getOperand(2);
        NewBlend->setOperand(0, Inc1);
        NewBlend->setOperand(1, Inc0);
        NewBlend->setOperand(2, NewMask);
        if (OldMask->getNumUsers() == 0)
          cast<VPInstruction>(OldMask)->eraseFromParent();
      }
    }
  }
}

/// Optimize the width of vector induction variables in \p Plan based on a
/// known constant Trip Count, \p BestVF and \p BestUF.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
                                                     ElementCount BestVF,
                                                     unsigned BestUF) {
  // Only proceed if we have not completely removed the vector region.
  if (!Plan.getVectorLoopRegion())
    return false;

  const APInt *TC;
  if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
    return false;

  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
  // and UF. Returns at least 8.
  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
    APInt AlignedTC =
        Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
                                       APInt::Rounding::UP);
    APInt MaxVal = AlignedTC - 1;
    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
  };
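  // Worked example (illustrative): for TC = 1000 and VF * UF = 8, the aligned
  // trip count is 1000, MaxVal = 999 needs 10 active bits, and the result is
  // PowerOf2Ceil(10) = 16, i.e. the IV can be narrowed to i16.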
1639 unsigned NewBitWidth =
1640 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1641
1642 LLVMContext &Ctx = Plan.getContext();
1643 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1644
1645 bool MadeChange = false;
1646
1647 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1648 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1649 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1650
1651    // Currently we only handle canonical IVs, as it is trivial to replace the
1652    // start and stop values, and we only perform the optimization when the IV
1653    // has a single user.
1654 if (!WideIV || !WideIV->isCanonical() ||
1655 WideIV->hasMoreThanOneUniqueUser() ||
1656 NewIVTy == WideIV->getScalarType())
1657 continue;
1658
1659 // Currently only handle cases where the single user is a header-mask
1660 // comparison with the backedge-taken-count.
1661 VPUser *SingleUser = WideIV->getSingleUser();
1662 if (!SingleUser ||
1663 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1666 continue;
1667
1668 // Update IV operands and comparison bound to use new narrower type.
1669 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1670 WideIV->setStartValue(NewStart);
1671 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1672 WideIV->setStepValue(NewStep);
1673
1674 auto *NewBTC = new VPWidenCastRecipe(
1675 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1676 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1677 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1678 Cmp->setOperand(1, NewBTC);
1679
1680 MadeChange = true;
1681 }
1682
1683 return MadeChange;
1684}
1685
1686/// Return true if \p Cond is known to be true for given \p BestVF and \p
1687/// BestUF.
1688static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1689                                      ElementCount BestVF, unsigned BestUF,
1690 ScalarEvolution &SE) {
1692 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1693 &SE](VPValue *C) {
1694 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1695 });
1696
1697 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1699 m_Specific(CanIV->getBackedgeValue()),
1700 m_Specific(&Plan.getVectorTripCount()))))
1701 return false;
1702
1703 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1704 // count is not conveniently available as SCEV so far, so we compare directly
1705 // against the original trip count. This is stricter than necessary, as we
1706 // will only return true if the trip count == vector trip count.
1707 const SCEV *VectorTripCount =
1709 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1710 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1711 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1712 "Trip count SCEV must be computable");
1713 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1714 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1715 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1716}
1717
1718/// Try to replace multiple active lane masks used for control flow with
1719/// a single, wide active lane mask instruction followed by multiple
1720/// extract subvector intrinsics. This applies to the active lane mask
1721/// instructions both in the loop and in the preheader.
1722/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1723/// new extracts from the first active lane mask, which has its last
1724/// operand (multiplier) set to UF.
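/// A rough sketch of the rewrite for UF = 2 (recipe names are illustrative):
///
///   %wide.alm  = active-lane-mask %index, %tc, 2   ; VF x 2 lanes
///   %alm.part0 = vector.extract %wide.alm, 0
///   %alm.part1 = vector.extract %wide.alm, VF
///
/// where the part-0 extract feeds the latch branch and each
/// active-lane-mask-phi uses the extract for its unroll part.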
1725static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
1726                                       unsigned UF) {
1727 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1728 return false;
1729
1730 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1731 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1732 auto *Term = &ExitingVPBB->back();
1733
1734 using namespace llvm::VPlanPatternMatch;
1736 m_VPValue(), m_VPValue(), m_VPValue())))))
1737 return false;
1738
1739 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1740 LLVMContext &Ctx = Plan.getContext();
1741
1742 auto ExtractFromALM = [&](VPInstruction *ALM,
1743 SmallVectorImpl<VPValue *> &Extracts) {
1744 DebugLoc DL = ALM->getDebugLoc();
1745 for (unsigned Part = 0; Part < UF; ++Part) {
1747 Ops.append({ALM, Plan.getOrAddLiveIn(
1748 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1749 VF.getKnownMinValue() * Part))});
1750 auto *Ext =
1751 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1752 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1753 Extracts[Part] = Ext;
1754 Ext->insertAfter(ALM);
1755 }
1756 };
1757
1758 // Create a list of each active lane mask phi, ordered by unroll part.
1760 for (VPRecipeBase &R : Header->phis()) {
1762 if (!Phi)
1763 continue;
1764 VPValue *Index = nullptr;
1765 match(Phi->getBackedgeValue(),
1767 assert(Index && "Expected index from ActiveLaneMask instruction");
1768
1769 uint64_t Part;
1770 if (match(Index,
1772 m_VPValue(), m_ConstantInt(Part))))
1773 Phis[Part] = Phi;
1774 else
1775 // Anything other than a CanonicalIVIncrementForPart is part 0
1776 Phis[0] = Phi;
1777 }
1778
1779 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1780 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1781
1782 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1783 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1784
1785 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1786 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1787 "Expected incoming values of Phi to be ActiveLaneMasks");
1788
1789  // When using wide lane masks, the get.active.lane.mask intrinsic returns a
1790  // mask of VF x UF elements, with the multiplier (last operand) set to UF.
1791 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1792 EntryALM->setOperand(2, ALMMultiplier);
1793 LoopALM->setOperand(2, ALMMultiplier);
1794
1795 // Create UF x extract vectors and insert into preheader.
1796 SmallVector<VPValue *> EntryExtracts(UF);
1797 ExtractFromALM(EntryALM, EntryExtracts);
1798
1799 // Create UF x extract vectors and insert before the loop compare & branch,
1800 // updating the compare to use the first extract.
1801 SmallVector<VPValue *> LoopExtracts(UF);
1802 ExtractFromALM(LoopALM, LoopExtracts);
1803 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1804 Not->setOperand(0, LoopExtracts[0]);
1805
1806 // Update the incoming values of active lane mask phis.
1807 for (unsigned Part = 0; Part < UF; ++Part) {
1808 Phis[Part]->setStartValue(EntryExtracts[Part]);
1809 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1810 }
1811
1812 return true;
1813}
1814
1815/// Try to simplify the branch condition of \p Plan. This may restrict the
1816/// resulting plan to \p BestVF and \p BestUF.
1817static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1818                                              unsigned BestUF,
1819                                              PredicatedScalarEvolution &PSE) {
1820 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1821 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1822 auto *Term = &ExitingVPBB->back();
1823 VPValue *Cond;
1824 ScalarEvolution &SE = *PSE.getSE();
1825 if (match(Term, m_BranchOnCount()) ||
1827 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1828 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1829 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1830 const SCEV *VectorTripCount =
1832 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1833 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1834 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1835 "Trip count SCEV must be computable");
1836 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1837 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1838 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
1839 return false;
1840 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1841 // For BranchOnCond, check if we can prove the condition to be true using VF
1842 // and UF.
1843 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1844 return false;
1845 } else {
1846 return false;
1847 }
1848
1849 // The vector loop region only executes once. If possible, completely remove
1850 // the region, otherwise replace the terminator controlling the latch with
1851 // (BranchOnCond true).
1852 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
1853 // support for other non-canonical widen induction recipes (e.g.,
1854 // VPWidenPointerInductionRecipe).
1855 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1856 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
1857 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
1858 return R->isCanonical();
1859 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
1860 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
1861 })) {
1862 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
1863 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
1864 VPBuilder Builder(Plan.getVectorPreheader());
1865 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
1866 R->getScalarType());
1867 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
1868 HeaderR.eraseFromParent();
1869 continue;
1870 }
1871 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
1872 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
1873 HeaderR.eraseFromParent();
1874 }
1875
1876 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1877 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1878 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
1879 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
1880
1881 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
1882 B->setParent(nullptr);
1883
1884 VPBlockUtils::connectBlocks(Preheader, Header);
1885 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1887 } else {
1888 // The vector region contains header phis for which we cannot remove the
1889 // loop region yet.
1890 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
1891 {}, {}, Term->getDebugLoc());
1892 ExitingVPBB->appendRecipe(BOC);
1893 }
1894
1895 Term->eraseFromParent();
1896
1897 return true;
1898}
1899
1900/// From the definition of llvm.experimental.get.vector.length,
1901/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
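/// As an example (values are illustrative): if the AVL is known via SCEV to be
/// at most VF, for instance when the trip count itself is known not to exceed
/// VF, the EXPLICIT-VECTOR-LENGTH recipe is replaced by the AVL itself,
/// truncated or zero-extended to i32 as needed.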
1902static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF,
1903                             PredicatedScalarEvolution &PSE) {
1904  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1905           vp_depth_first_deep(Plan.getEntry()))) {
1906 for (VPRecipeBase &R : *VPBB) {
1907 VPValue *AVL;
1908 if (!match(&R, m_EVL(m_VPValue(AVL))))
1909 continue;
1910
1911 ScalarEvolution &SE = *PSE.getSE();
1912 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, SE);
1913 if (isa<SCEVCouldNotCompute>(AVLSCEV))
1914 continue;
1915 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
1916 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
1917 continue;
1918
1920 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
1921 R.getDebugLoc());
1922 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
1923 return true;
1924 }
1925 }
1926 return false;
1927}
1928
1930 unsigned BestUF,
1932 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1933 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1934
1935 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
1936 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1937 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1938 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
1939
1940 if (MadeChange) {
1941 Plan.setVF(BestVF);
1942 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1943 }
1944}
1945
1946/// Sink users of \p FOR after the recipe defining the previous value \p
1947/// Previous of the recurrence. \returns true if all users of \p FOR could be
1948/// re-arranged as needed or false if it is not possible.
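/// As a small sketch (recipe names are illustrative), given
///
///   %for  = fixed-order-recurrence-phi [ %init, %ph ], [ %prev, %body ]
///   %use  = add %for, 1          ; user of the recurrence phi
///   %prev = load %ptr            ; defines the previous value
///
/// the user %use is moved after %prev, so that a splice of (%for, %prev) can
/// later be inserted before any user of the recurrence.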
1949static bool
1950sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1951                                 VPRecipeBase *Previous,
1952 VPDominatorTree &VPDT) {
1953 // Collect recipes that need sinking.
1956 Seen.insert(Previous);
1957 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
1958    // The previous value must not depend on the users of the recurrence phi;
1959    // otherwise FOR would not be a fixed-order recurrence.
1960 if (SinkCandidate == Previous)
1961 return false;
1962
1963 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
1964 !Seen.insert(SinkCandidate).second ||
1965 VPDT.properlyDominates(Previous, SinkCandidate))
1966 return true;
1967
1968 if (cannotHoistOrSinkRecipe(*SinkCandidate))
1969 return false;
1970
1971 WorkList.push_back(SinkCandidate);
1972 return true;
1973 };
1974
1975 // Recursively sink users of FOR after Previous.
1976 WorkList.push_back(FOR);
1977 for (unsigned I = 0; I != WorkList.size(); ++I) {
1978 VPRecipeBase *Current = WorkList[I];
1979 assert(Current->getNumDefinedValues() == 1 &&
1980 "only recipes with a single defined value expected");
1981
1982 for (VPUser *User : Current->getVPSingleValue()->users()) {
1983 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
1984 return false;
1985 }
1986 }
1987
1988 // Keep recipes to sink ordered by dominance so earlier instructions are
1989 // processed first.
1990 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1991 return VPDT.properlyDominates(A, B);
1992 });
1993
1994 for (VPRecipeBase *SinkCandidate : WorkList) {
1995 if (SinkCandidate == FOR)
1996 continue;
1997
1998 SinkCandidate->moveAfter(Previous);
1999 Previous = SinkCandidate;
2000 }
2001 return true;
2002}
2003
2004/// Try to hoist \p Previous and its operands before all users of \p FOR.
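/// This is the counterpart to sinking: if, for example, %prev = load %ptr
/// cannot be computed after the users of \p FOR, the load (and any of its
/// operands defined inside the loop) is instead moved up before the first
/// user, provided none of the moved recipes depends on \p FOR itself. (Sketch
/// only; the exact legality checks are in the code below.)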
2005static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
2006                                        VPRecipeBase *Previous,
2007 VPDominatorTree &VPDT) {
2008 if (cannotHoistOrSinkRecipe(*Previous))
2009 return false;
2010
2011 // Collect recipes that need hoisting.
2012 SmallVector<VPRecipeBase *> HoistCandidates;
2014 VPRecipeBase *HoistPoint = nullptr;
2015 // Find the closest hoist point by looking at all users of FOR and selecting
2016 // the recipe dominating all other users.
2017 for (VPUser *U : FOR->users()) {
2018 auto *R = cast<VPRecipeBase>(U);
2019 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2020 HoistPoint = R;
2021 }
2022 assert(all_of(FOR->users(),
2023 [&VPDT, HoistPoint](VPUser *U) {
2024 auto *R = cast<VPRecipeBase>(U);
2025 return HoistPoint == R ||
2026 VPDT.properlyDominates(HoistPoint, R);
2027 }) &&
2028 "HoistPoint must dominate all users of FOR");
2029
2030 auto NeedsHoisting = [HoistPoint, &VPDT,
2031 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2032 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2033 if (!HoistCandidate)
2034 return nullptr;
2035 VPRegionBlock *EnclosingLoopRegion =
2036 HoistCandidate->getParent()->getEnclosingLoopRegion();
2037 assert((!HoistCandidate->getRegion() ||
2038 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2039 "CFG in VPlan should still be flat, without replicate regions");
2040 // Hoist candidate was already visited, no need to hoist.
2041 if (!Visited.insert(HoistCandidate).second)
2042 return nullptr;
2043
2044    // The candidate is outside the loop region or is a header phi; it dominates
2045    // FOR's users without hoisting.
2046 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2047 return nullptr;
2048
2049 // If we reached a recipe that dominates HoistPoint, we don't need to
2050 // hoist the recipe.
2051 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2052 return nullptr;
2053 return HoistCandidate;
2054 };
2055
2056 if (!NeedsHoisting(Previous->getVPSingleValue()))
2057 return true;
2058
2059 // Recursively try to hoist Previous and its operands before all users of FOR.
2060 HoistCandidates.push_back(Previous);
2061
2062 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2063 VPRecipeBase *Current = HoistCandidates[I];
2064 assert(Current->getNumDefinedValues() == 1 &&
2065 "only recipes with a single defined value expected");
2066 if (cannotHoistOrSinkRecipe(*Current))
2067 return false;
2068
2069 for (VPValue *Op : Current->operands()) {
2070 // If we reach FOR, it means the original Previous depends on some other
2071 // recurrence that in turn depends on FOR. If that is the case, we would
2072 // also need to hoist recipes involving the other FOR, which may break
2073 // dependencies.
2074 if (Op == FOR)
2075 return false;
2076
2077 if (auto *R = NeedsHoisting(Op))
2078 HoistCandidates.push_back(R);
2079 }
2080 }
2081
2082 // Order recipes to hoist by dominance so earlier instructions are processed
2083 // first.
2084 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2085 return VPDT.properlyDominates(A, B);
2086 });
2087
2088 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2089 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2090 HoistPoint->getIterator());
2091 }
2092
2093 return true;
2094}
2095
2097 VPBuilder &LoopBuilder) {
2098 VPDominatorTree VPDT(Plan);
2099
2101 for (VPRecipeBase &R :
2104 RecurrencePhis.push_back(FOR);
2105
2106 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2108 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2109 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2110 // to terminate.
2111 while (auto *PrevPhi =
2113 assert(PrevPhi->getParent() == FOR->getParent());
2114 assert(SeenPhis.insert(PrevPhi).second);
2115 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2116 }
2117
2118 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2119 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2120 return false;
2121
2122 // Introduce a recipe to combine the incoming and previous values of a
2123 // fixed-order recurrence.
2124 VPBasicBlock *InsertBlock = Previous->getParent();
2125 if (isa<VPHeaderPHIRecipe>(Previous))
2126 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2127 else
2128 LoopBuilder.setInsertPoint(InsertBlock,
2129 std::next(Previous->getIterator()));
2130
2131 auto *RecurSplice =
2133 {FOR, FOR->getBackedgeValue()});
2134
2135 FOR->replaceAllUsesWith(RecurSplice);
2136 // Set the first operand of RecurSplice to FOR again, after replacing
2137 // all users.
2138 RecurSplice->setOperand(0, FOR);
2139 }
2140 return true;
2141}
2142
2144 for (VPRecipeBase &R :
2146 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2147 if (!PhiR)
2148 continue;
2149 RecurKind RK = PhiR->getRecurrenceKind();
2150 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2152 continue;
2153
2154 for (VPUser *U : collectUsersRecursively(PhiR))
2155 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2156 RecWithFlags->dropPoisonGeneratingFlags();
2157 }
2158 }
2159}
2160
2161namespace {
2162struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2163 static bool isSentinel(const VPSingleDefRecipe *Def) {
2164 return Def == getEmptyKey() || Def == getTombstoneKey();
2165 }
2166
2167 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2168 /// return that source element type.
2169 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2170 // All VPInstructions that lower to GEPs must have the i8 source element
2171 // type (as they are PtrAdds), so we omit it.
2173 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2174 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2175 return GEP->getSourceElementType();
2176 return nullptr;
2177 })
2178 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2179 [](auto *I) { return I->getSourceElementType(); })
2180 .Default([](auto *) { return nullptr; });
2181 }
2182
2183  /// Returns true if recipe \p Def can be safely handled for CSE.
2184 static bool canHandle(const VPSingleDefRecipe *Def) {
2185 // We can extend the list of handled recipes in the future,
2186 // provided we account for the data embedded in them while checking for
2187 // equality or hashing.
2188 auto C = getOpcodeOrIntrinsicID(Def);
2189
2190 // The issue with (Insert|Extract)Value is that the index of the
2191 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2192 // VPlan.
2193 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2194 C->second == Instruction::ExtractValue)))
2195 return false;
2196
2197 // During CSE, we can only handle recipes that don't read from memory: if
2198 // they read from memory, there could be an intervening write to memory
2199 // before the next instance is CSE'd, leading to an incorrect result.
2200 return !Def->mayReadFromMemory();
2201 }
2202
2203 /// Hash the underlying data of \p Def.
2204 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2205 const VPlan *Plan = Def->getParent()->getPlan();
2206 VPTypeAnalysis TypeInfo(*Plan);
2207 hash_code Result = hash_combine(
2208 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2209 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2211 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2212 if (RFlags->hasPredicate())
2213 return hash_combine(Result, RFlags->getPredicate());
2214 return Result;
2215 }
2216
2217 /// Check equality of underlying data of \p L and \p R.
2218 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2219 if (isSentinel(L) || isSentinel(R))
2220 return L == R;
2221 if (L->getVPDefID() != R->getVPDefID() ||
2223 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2225 !equal(L->operands(), R->operands()))
2226 return false;
2228 "must have valid opcode info for both recipes");
2229 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2230 if (LFlags->hasPredicate() &&
2231 LFlags->getPredicate() !=
2232 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2233 return false;
2234    // Recipes in replicate regions implicitly depend on the predicate. If either
2235 // recipe is in a replicate region, only consider them equal if both have
2236 // the same parent.
2237 const VPRegionBlock *RegionL = L->getRegion();
2238 const VPRegionBlock *RegionR = R->getRegion();
2239 if (((RegionL && RegionL->isReplicator()) ||
2240 (RegionR && RegionR->isReplicator())) &&
2241 L->getParent() != R->getParent())
2242 return false;
2243 const VPlan *Plan = L->getParent()->getPlan();
2244 VPTypeAnalysis TypeInfo(*Plan);
2245 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2246 }
2247};
2248} // end anonymous namespace
2249
2250/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2251/// Plan.
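/// For example (recipes are illustrative), two identical widened adds
///
///   %a = WIDEN add %x, %y
///   %b = WIDEN add %x, %y
///
/// are merged so that users of %b use %a instead, provided %a's block
/// dominates %b's and neither recipe reads from memory.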
2252void VPlanTransforms::cse(VPlan &Plan) {
2253  VPDominatorTree VPDT(Plan);
2255
2257 vp_depth_first_deep(Plan.getEntry()))) {
2258 for (VPRecipeBase &R : *VPBB) {
2259 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2260 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2261 continue;
2262 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2263 // V must dominate Def for a valid replacement.
2264 if (!VPDT.dominates(V->getParent(), VPBB))
2265 continue;
2266 // Only keep flags present on both V and Def.
2267 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2268 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2269 Def->replaceAllUsesWith(V);
2270 continue;
2271 }
2272 CSEMap[Def] = Def;
2273 }
2274 }
2275}
2276
2277/// Move loop-invariant recipes out of the vector loop region in \p Plan.
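/// For instance (illustrative), a WIDEN mul whose operands are both defined
/// outside the loop region is moved from the loop body to the vector
/// preheader, so it is computed once instead of on every iteration.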
2278static void licm(VPlan &Plan) {
2279 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2280
2281 // Hoist any loop invariant recipes from the vector loop region to the
2282  // preheader. Perform a shallow traversal of the vector loop region, to
2283 // exclude recipes in replicate regions. Since the top-level blocks in the
2284 // vector loop region are guaranteed to execute if the vector pre-header is,
2285 // we don't need to check speculation safety.
2286 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2287 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2288         "Expected vector preheader's successor to be the vector loop region");
2290 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2291 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2293 continue;
2294 if (any_of(R.operands(), [](VPValue *Op) {
2295 return !Op->isDefinedOutsideLoopRegions();
2296 }))
2297 continue;
2298 R.moveBefore(*Preheader, Preheader->end());
2299 }
2300 }
2301}
2302
2304 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2305 if (Plan.hasScalarVFOnly())
2306 return;
2307 // Keep track of created truncates, so they can be re-used. Note that we
2308  // cannot use RAUW after creating a new truncate, as this could make
2309 // other uses have different types for their operands, making them invalidly
2310 // typed.
2312 VPTypeAnalysis TypeInfo(Plan);
2313 VPBasicBlock *PH = Plan.getVectorPreheader();
2316 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2319 &R))
2320 continue;
2321
2322 VPValue *ResultVPV = R.getVPSingleValue();
2323 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2324 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2325 if (!NewResSizeInBits)
2326 continue;
2327
2328 // If the value wasn't vectorized, we must maintain the original scalar
2329 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2330 // skip casts which do not need to be handled explicitly here, as
2331 // redundant casts will be removed during recipe simplification.
2333 continue;
2334
2335 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2336 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2337 assert(OldResTy->isIntegerTy() && "only integer types supported");
2338 (void)OldResSizeInBits;
2339
2340 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2341
2342 // Any wrapping introduced by shrinking this operation shouldn't be
2343 // considered undefined behavior. So, we can't unconditionally copy
2344 // arithmetic wrapping flags to VPW.
2345 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2346 VPW->dropPoisonGeneratingFlags();
2347
2348 if (OldResSizeInBits != NewResSizeInBits &&
2349 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2350 // Extend result to original width.
2351 auto *Ext =
2352 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2353 Ext->insertAfter(&R);
2354 ResultVPV->replaceAllUsesWith(Ext);
2355 Ext->setOperand(0, ResultVPV);
2356 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2357 } else {
2358 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2359 "Only ICmps should not need extending the result.");
2360 }
2361
2362 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2364 continue;
2365
2366 // Shrink operands by introducing truncates as needed.
2367 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2368 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2369 auto *Op = R.getOperand(Idx);
2370 unsigned OpSizeInBits =
2372 if (OpSizeInBits == NewResSizeInBits)
2373 continue;
2374 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2375 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2376 if (!IterIsEmpty) {
2377 R.setOperand(Idx, ProcessedIter->second);
2378 continue;
2379 }
2380
2381 VPBuilder Builder;
2382 if (Op->isLiveIn())
2383 Builder.setInsertPoint(PH);
2384 else
2385 Builder.setInsertPoint(&R);
2386 VPWidenCastRecipe *NewOp =
2387 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2388 ProcessedIter->second = NewOp;
2389 R.setOperand(Idx, NewOp);
2390 }
2391
2392 }
2393 }
2394}
2395
2399 VPValue *Cond;
2400 // Skip blocks that are not terminated by BranchOnCond.
2401 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2402 continue;
2403
2404 assert(VPBB->getNumSuccessors() == 2 &&
2405 "Two successors expected for BranchOnCond");
2406 unsigned RemovedIdx;
2407 if (match(Cond, m_True()))
2408 RemovedIdx = 1;
2409 else if (match(Cond, m_False()))
2410 RemovedIdx = 0;
2411 else
2412 continue;
2413
2414 VPBasicBlock *RemovedSucc =
2415 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2416 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2417 "There must be a single edge between VPBB and its successor");
2418    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2419 // these recipes.
2420 for (VPRecipeBase &R : RemovedSucc->phis())
2421 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2422
2423 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2424 // automatically on VPlan destruction if it becomes unreachable.
2425 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2426 VPBB->back().eraseFromParent();
2427 }
2428}
2429
2449
2450// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2451// the loop terminator with a branch-on-cond recipe with the negated
2452// active-lane-mask as operand. Note that this turns the loop into an
2453// uncountable one. Only the existing terminator is replaced, all other existing
2454// recipes/users remain unchanged, except for poison-generating flags being
2455// dropped from the canonical IV increment. Return the created
2456// VPActiveLaneMaskPHIRecipe.
2457//
2458// The function uses the following definitions:
2459//
2460// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2461// calculate-trip-count-minus-VF (original TC) : original TC
2462// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2463// CanonicalIVPhi : CanonicalIVIncrement
2464// %StartV is the canonical induction start value.
2465//
2466// The function adds the following recipes:
2467//
2468// vector.ph:
2469// %TripCount = calculate-trip-count-minus-VF (original TC)
2470// [if DataWithControlFlowWithoutRuntimeCheck]
2471// %EntryInc = canonical-iv-increment-for-part %StartV
2472// %EntryALM = active-lane-mask %EntryInc, %TripCount
2473//
2474// vector.body:
2475// ...
2476// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2477// ...
2478// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2479// %ALM = active-lane-mask %InLoopInc, TripCount
2480// %Negated = Not %ALM
2481// branch-on-cond %Negated
2482//
2485 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2486 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2487 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2488 VPValue *StartV = CanonicalIVPHI->getStartValue();
2489
2490 auto *CanonicalIVIncrement =
2491 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2492 // TODO: Check if dropping the flags is needed if
2493 // !DataAndControlFlowWithoutRuntimeCheck.
2494 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2495 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2496 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2497 // we have to take unrolling into account. Each part needs to start at
2498 // Part * VF
2499 auto *VecPreheader = Plan.getVectorPreheader();
2500 VPBuilder Builder(VecPreheader);
2501
2502 // Create the ActiveLaneMask instruction using the correct start values.
2503 VPValue *TC = Plan.getTripCount();
2504
2505 VPValue *TripCount, *IncrementValue;
2507 // When the loop is guarded by a runtime overflow check for the loop
2508 // induction variable increment by VF, we can increment the value before
2509    // the get.active.lane.mask intrinsic and use the unmodified trip count.
2510 IncrementValue = CanonicalIVIncrement;
2511 TripCount = TC;
2512 } else {
2513 // When avoiding a runtime check, the active.lane.mask inside the loop
2514 // uses a modified trip count and the induction variable increment is
2515 // done after the active.lane.mask intrinsic is called.
2516 IncrementValue = CanonicalIVPHI;
2517 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2518 {TC}, DL);
2519 }
2520 auto *EntryIncrement = Builder.createOverflowingOp(
2521 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2522 "index.part.next");
2523
2524 // Create the active lane mask instruction in the VPlan preheader.
2525 VPValue *ALMMultiplier =
2526 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2527 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2528 {EntryIncrement, TC, ALMMultiplier}, DL,
2529 "active.lane.mask.entry");
2530
2531 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2532 // preheader ActiveLaneMask instruction.
2533 auto *LaneMaskPhi =
2535 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2536
2537 // Create the active lane mask for the next iteration of the loop before the
2538 // original terminator.
2539 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2540 Builder.setInsertPoint(OriginalTerminator);
2541 auto *InLoopIncrement =
2542 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2543 {IncrementValue}, {false, false}, DL);
2544 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2545 {InLoopIncrement, TripCount, ALMMultiplier},
2546 DL, "active.lane.mask.next");
2547 LaneMaskPhi->addOperand(ALM);
2548
2549 // Replace the original terminator with BranchOnCond. We have to invert the
2550 // mask here because a true condition means jumping to the exit block.
2551 auto *NotMask = Builder.createNot(ALM, DL);
2552 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2553 OriginalTerminator->eraseFromParent();
2554 return LaneMaskPhi;
2555}
2556
2557/// Collect the header mask with the pattern:
2558/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2559/// TODO: Introduce explicit recipe for header-mask instead of searching
2560/// for the header-mask pattern manually.
2561static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
2562  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2563 SmallVector<VPValue *> WideCanonicalIVs;
2564 auto *FoundWidenCanonicalIVUser = find_if(
2566 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2568 "Must have at most one VPWideCanonicalIVRecipe");
2569 if (FoundWidenCanonicalIVUser !=
2570 LoopRegion->getCanonicalIV()->users().end()) {
2571 auto *WideCanonicalIV =
2572 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2573 WideCanonicalIVs.push_back(WideCanonicalIV);
2574 }
2575
2576 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2577 // version of the canonical induction.
2578 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2579 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2580 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2581 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2582 WideCanonicalIVs.push_back(WidenOriginalIV);
2583 }
2584
2585 // Walk users of wide canonical IVs and find the single compare of the form
2586 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2587 VPSingleDefRecipe *HeaderMask = nullptr;
2588 for (auto *Wide : WideCanonicalIVs) {
2589 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2590 auto *VPI = dyn_cast<VPInstruction>(U);
2591 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2592 continue;
2593
2594 assert(VPI->getOperand(0) == Wide &&
2595 "WidenCanonicalIV must be the first operand of the compare");
2596 assert(!HeaderMask && "Multiple header masks found?");
2597 HeaderMask = VPI;
2598 }
2599 }
2600 return HeaderMask;
2601}
2602
2604 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2607 UseActiveLaneMaskForControlFlow) &&
2608 "DataAndControlFlowWithoutRuntimeCheck implies "
2609 "UseActiveLaneMaskForControlFlow");
2610
2611 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2612 auto *FoundWidenCanonicalIVUser = find_if(
2614 assert(FoundWidenCanonicalIVUser &&
2615 "Must have widened canonical IV when tail folding!");
2616 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2617 auto *WideCanonicalIV =
2618 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2619 VPSingleDefRecipe *LaneMask;
2620 if (UseActiveLaneMaskForControlFlow) {
2623 } else {
2624 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2625 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2626 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2627 LaneMask =
2628 B.createNaryOp(VPInstruction::ActiveLaneMask,
2629 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2630 nullptr, "active.lane.mask");
2631 }
2632
2633 // Walk users of WideCanonicalIV and replace the header mask of the form
2634 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2635 // removing the old one to ensure there is always only a single header mask.
2636 HeaderMask->replaceAllUsesWith(LaneMask);
2637 HeaderMask->eraseFromParent();
2638}
2639
2640template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2641 Op0_t In;
2643
2644 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2645
2646 template <typename OpTy> bool match(OpTy *V) const {
2647 if (m_Specific(In).match(V)) {
2648 Out = nullptr;
2649 return true;
2650 }
2651 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2652 }
2653};
2654
2655/// Match the mask \p In directly, or a logical-and of \p In with another mask.
2656/// On a match, bind the remaining mask to \p Out (nullptr if \p In matched exactly).
2657template <typename Op0_t, typename Op1_t>
2658static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2659 Op1_t &Out) {
2660 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2661}
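// A few illustrative uses, assuming In is the header mask %hm:
//   match(%hm)                   -> true,  Out == nullptr
//   match(logical-and %hm, %m)   -> true,  Out == %m
//   match(%other)                -> false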
2662
2663/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2664/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2665/// recipe could be created.
2666/// \p HeaderMask Header Mask.
2667/// \p CurRecipe Recipe to be transformed.
2668/// \p TypeInfo VPlan-based type analysis.
2669/// \p EVL The explicit vector length parameter of vector-predication
2670/// intrinsics.
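/// For example (sketch only, syntax approximated), a consecutive masked load
/// that is predicated solely by the header mask
///
///   WIDEN %l = load %addr, mask: %header.mask
///
/// becomes an EVL-predicated load
///
///   WIDEN %l = vp.load %addr, all-true mask, EVL %evl
///
/// and a select on the header mask becomes a vp.merge taking %evl.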
2671static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
2672                                       VPRecipeBase &CurRecipe,
2673 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2674 VPlan *Plan = CurRecipe.getParent()->getPlan();
2675 VPValue *Addr, *Mask, *EndPtr;
2676
2677  /// Adjust any end pointers so that they point to the end of the EVL lanes, not VF.
2678 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2679 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2680 EVLEndPtr->insertBefore(&CurRecipe);
2681 EVLEndPtr->setOperand(1, &EVL);
2682 return EVLEndPtr;
2683 };
2684
2685 if (match(&CurRecipe,
2686 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2687 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2688 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2689 EVL, Mask);
2690
2691 if (match(&CurRecipe,
2692 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2693 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2694 cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2695 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
2696 AdjustEndPtr(EndPtr), EVL, Mask);
2697
2698 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
2699 m_RemoveMask(HeaderMask, Mask))) &&
2700 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2701 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2702 EVL, Mask);
2703
2704 if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
2705 m_RemoveMask(HeaderMask, Mask))) &&
2706 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2707 cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2708 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2709 AdjustEndPtr(EndPtr), EVL, Mask);
2710
2711 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2712 if (Rdx->isConditional() &&
2713 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2714 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2715
2716 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2717 if (Interleave->getMask() &&
2718 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2719 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2720
2721 VPValue *LHS, *RHS;
2722 if (match(&CurRecipe,
2723 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2724 return new VPWidenIntrinsicRecipe(
2725 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2726 TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
2727
2728 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2729 m_VPValue(RHS))))
2730 return new VPWidenIntrinsicRecipe(
2731 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2732 TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
2733
2734 return nullptr;
2735}
2736
2737/// Replace recipes with their EVL variants.
2738static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2739  VPTypeAnalysis TypeInfo(Plan);
2740 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2741 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2742
2743 assert(all_of(Plan.getVF().users(),
2746 "User of VF that we can't transform to EVL.");
2747 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2749 });
2750
2751 assert(all_of(Plan.getVFxUF().users(),
2752 [&LoopRegion, &Plan](VPUser *U) {
2753 return match(U,
2754 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
2755 m_Specific(&Plan.getVFxUF()))) ||
2756 isa<VPWidenPointerInductionRecipe>(U);
2757 }) &&
2758 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2759 "increment of the canonical induction.");
2760 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2761 // Only replace uses in VPWidenPointerInductionRecipe; The increment of the
2762 // canonical induction must not be updated.
2764 });
2765
2766 // Defer erasing recipes till the end so that we don't invalidate the
2767 // VPTypeAnalysis cache.
2769
2770  // Create a scalar phi to track the previous EVL if a fixed-order recurrence
2771  // is present.
2772 bool ContainsFORs =
2774 if (ContainsFORs) {
2775 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2776 VPValue *MaxEVL = &Plan.getVF();
2777    // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
2778 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2779 MaxEVL = Builder.createScalarZExtOrTrunc(
2780 MaxEVL, Type::getInt32Ty(Plan.getContext()),
2781 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
2782
2783 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
2784 VPValue *PrevEVL = Builder.createScalarPhi(
2785 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
2786
2789 for (VPRecipeBase &R : *VPBB) {
2790 VPValue *V1, *V2;
2791 if (!match(&R,
2793 m_VPValue(V1), m_VPValue(V2))))
2794 continue;
2795 VPValue *Imm = Plan.getOrAddLiveIn(
2798 Intrinsic::experimental_vp_splice,
2799 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
2800 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
2801 R.getDebugLoc());
2802 VPSplice->insertBefore(&R);
2803 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
2804 ToErase.push_back(&R);
2805 }
2806 }
2807 }
2808
2809 VPValue *HeaderMask = findHeaderMask(Plan);
2810 if (!HeaderMask)
2811 return;
2812
2813 // Replace header masks with a mask equivalent to predicating by EVL:
2814 //
2815 // icmp ule widen-canonical-iv backedge-taken-count
2816 // ->
2817 // icmp ult step-vector, EVL
2818 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2819 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2820 Type *EVLType = TypeInfo.inferScalarType(&EVL);
2821 VPValue *EVLMask = Builder.createICmp(
2823 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2824 HeaderMask->replaceAllUsesWith(EVLMask);
2825 ToErase.push_back(HeaderMask->getDefiningRecipe());
2826
2827 // Try to optimize header mask recipes away to their EVL variants.
2828 // TODO: Split optimizeMaskToEVL out and move into
2829 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2830 // tryToBuildVPlanWithVPRecipes beforehand.
2831 for (VPUser *U : collectUsersRecursively(EVLMask)) {
2832 auto *CurRecipe = cast<VPRecipeBase>(U);
2833 VPRecipeBase *EVLRecipe =
2834 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
2835 if (!EVLRecipe)
2836 continue;
2837
2838 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2839 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2840 "New recipe must define the same number of values as the "
2841 "original.");
2842 EVLRecipe->insertBefore(CurRecipe);
2844 EVLRecipe)) {
2845 for (unsigned I = 0; I < NumDefVal; ++I) {
2846 VPValue *CurVPV = CurRecipe->getVPValue(I);
2847 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
2848 }
2849 }
2850 ToErase.push_back(CurRecipe);
2851 }
2852 // Remove dead EVL mask.
2853 if (EVLMask->getNumUsers() == 0)
2854 ToErase.push_back(EVLMask->getDefiningRecipe());
2855
2856 for (VPRecipeBase *R : reverse(ToErase)) {
2857 SmallVector<VPValue *> PossiblyDead(R->operands());
2858 R->eraseFromParent();
2859 for (VPValue *Op : PossiblyDead)
2860      recursivelyDeleteDeadRecipes(Op);
2861  }
2862}
2863
2864/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
2865/// replaces all uses except the canonical IV increment of
2866/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
2867/// is used only for loop iterations counting after this transformation.
2868///
2869/// The function uses the following definitions:
2870/// %StartV is the canonical induction start value.
2871///
2872/// The function adds the following recipes:
2873///
2874/// vector.ph:
2875/// ...
2876///
2877/// vector.body:
2878/// ...
2879/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2880/// [ %NextEVLIV, %vector.body ]
2881/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2882/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2883/// ...
2884/// %OpEVL = cast i32 %VPEVL to IVSize
2885/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2886/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2887/// ...
2888///
2889/// If MaxSafeElements is provided, the function adds the following recipes:
2890/// vector.ph:
2891/// ...
2892///
2893/// vector.body:
2894/// ...
2895/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2896/// [ %NextEVLIV, %vector.body ]
2897/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
2898/// %cmp = cmp ult %AVL, MaxSafeElements
2899/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2900/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2901/// ...
2902/// %OpEVL = cast i32 %VPEVL to IVSize
2903/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
2904/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
2905/// ...
2906///
2908 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2909 if (Plan.hasScalarVFOnly())
2910 return;
2911 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2912 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2913
2914 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
2915 auto *CanIVTy = LoopRegion->getCanonicalIVType();
2916 VPValue *StartV = CanonicalIVPHI->getStartValue();
2917
2918 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2919 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
2920 EVLPhi->insertAfter(CanonicalIVPHI);
2921 VPBuilder Builder(Header, Header->getFirstNonPhi());
2922 // Create the AVL (application vector length), starting from TC -> 0 in steps
2923 // of EVL.
2924 VPPhi *AVLPhi = Builder.createScalarPhi(
2925 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
2926 VPValue *AVL = AVLPhi;
2927
2928 if (MaxSafeElements) {
2929 // Support for MaxSafeDist for correct loop emission.
2930 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
2931 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
2932 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
2933 "safe_avl");
2934 }
2935 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
2937
2938 auto *CanonicalIVIncrement =
2939 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2940 Builder.setInsertPoint(CanonicalIVIncrement);
2941 VPValue *OpVPEVL = VPEVL;
2942
2943 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
2944 OpVPEVL = Builder.createScalarZExtOrTrunc(
2945 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
2946
2947 auto *NextEVLIV = Builder.createOverflowingOp(
2948 Instruction::Add, {OpVPEVL, EVLPhi},
2949 {CanonicalIVIncrement->hasNoUnsignedWrap(),
2950 CanonicalIVIncrement->hasNoSignedWrap()},
2951 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
2952 EVLPhi->addOperand(NextEVLIV);
2953
2954 VPValue *NextAVL = Builder.createOverflowingOp(
2955 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
2956 DebugLoc::getCompilerGenerated(), "avl.next");
2957 AVLPhi->addOperand(NextAVL);
2958
2959 transformRecipestoEVLRecipes(Plan, *VPEVL);
2960
2961 // Replace all uses of VPCanonicalIVPHIRecipe by
2962 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2963 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
2964 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
2965 // TODO: support unroll factor > 1.
2966 Plan.setUF(1);
2967}
2968
2970 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2971 // There should be only one EVL PHI in the entire plan.
2972 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2973
2976 for (VPRecipeBase &R : VPBB->phis())
2977 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2978 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2979 EVLPhi = PhiR;
2980 }
2981
2982 // Early return if no EVL PHI is found.
2983 if (!EVLPhi)
2984 return;
2985
2986 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2987 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2988 VPValue *AVL;
2989 [[maybe_unused]] bool FoundAVL =
2990 match(EVLIncrement,
2991 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
2992 assert(FoundAVL && "Didn't find AVL?");
2993
2994 // The AVL may be capped to a safe distance.
2995 VPValue *SafeAVL;
2996 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
2997 AVL = SafeAVL;
2998
2999 VPValue *AVLNext;
3000 [[maybe_unused]] bool FoundAVLNext =
3002 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3003 assert(FoundAVLNext && "Didn't find AVL backedge?");
3004
3005 // Convert EVLPhi to concrete recipe.
3006 auto *ScalarR =
3007 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3008 EVLPhi->getDebugLoc(), "evl.based.iv");
3009 EVLPhi->replaceAllUsesWith(ScalarR);
3010 EVLPhi->eraseFromParent();
3011
3012 // Replace CanonicalIVInc with EVL-PHI increment.
3013 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3014 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3015 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3016 m_Specific(&Plan.getVFxUF()))) &&
3017 "Unexpected canonical iv");
3018 Backedge->replaceAllUsesWith(EVLIncrement);
3019
3020 // Remove unused phi and increment.
3021 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3022 CanonicalIVIncrement->eraseFromParent();
3023 CanonicalIV->eraseFromParent();
3024
3025 // Replace the use of VectorTripCount in the latch-exiting block.
3026 // Before: (branch-on-count EVLIVInc, VectorTripCount)
3027 // After: (branch-on-cond eq AVLNext, 0)
3028
3029 VPBasicBlock *LatchExiting =
3030 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3031 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3032 // Skip single-iteration loop region
3033 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3034 return;
3035 assert(LatchExitingBr &&
3036 match(LatchExitingBr,
3037 m_BranchOnCount(m_VPValue(EVLIncrement),
3038 m_Specific(&Plan.getVectorTripCount()))) &&
3039 "Unexpected terminator in EVL loop");
3040
3041 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3042 VPBuilder Builder(LatchExitingBr);
3043 VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3044 Plan.getConstantInt(AVLTy, 0));
3045 Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
3046 LatchExitingBr->eraseFromParent();
3047}
3048
3050 VPlan &Plan, PredicatedScalarEvolution &PSE,
3051 const DenseMap<Value *, const SCEV *> &StridesMap) {
3052  // Replace VPValues for known constant strides guaranteed by predicated scalar
3053  // evolution.
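  // For example (names are illustrative): if PSE guarantees that the symbolic
  // stride %stride is 1, uses of its live-in VPValue (and of any sext/zext of
  // it) outside replicate regions are replaced by the constant 1, and SCEV
  // expansions are rewritten to use the constant as well.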
3054 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3055 auto *R = cast<VPRecipeBase>(&U);
3056 return R->getRegion() ||
3057 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3058 };
3059 ValueToSCEVMapTy RewriteMap;
3060 for (const SCEV *Stride : StridesMap.values()) {
3061 using namespace SCEVPatternMatch;
3062 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3063 const APInt *StrideConst;
3064 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3065 // Only handle constant strides for now.
3066 continue;
3067
3068 auto *CI = Plan.getConstantInt(*StrideConst);
3069 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3070 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3071
3072 // The versioned value may not be used in the loop directly but through a
3073 // sext/zext. Add new live-ins in those cases.
3074 for (Value *U : StrideV->users()) {
3076 continue;
3077 VPValue *StrideVPV = Plan.getLiveIn(U);
3078 if (!StrideVPV)
3079 continue;
3080 unsigned BW = U->getType()->getScalarSizeInBits();
3081 APInt C =
3082 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3083 VPValue *CI = Plan.getConstantInt(C);
3084 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3085 }
3086 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3087 }
3088
3089 for (VPRecipeBase &R : *Plan.getEntry()) {
3090 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3091 if (!ExpSCEV)
3092 continue;
3093 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3094 auto *NewSCEV =
3095 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3096 if (NewSCEV != ScevExpr) {
3097 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3098 ExpSCEV->replaceAllUsesWith(NewExp);
3099 if (Plan.getTripCount() == ExpSCEV)
3100 Plan.resetTripCount(NewExp);
3101 }
3102 }
3103}
3104
3106 VPlan &Plan,
3107 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3108 // Collect recipes in the backward slice of `Root` that may generate a poison
3109 // value that is used after vectorization.
3111 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3113 Worklist.push_back(Root);
3114
3115 // Traverse the backward slice of Root through its use-def chain.
3116 while (!Worklist.empty()) {
3117 VPRecipeBase *CurRec = Worklist.pop_back_val();
3118
3119 if (!Visited.insert(CurRec).second)
3120 continue;
3121
3122 // Prune search if we find another recipe generating a widen memory
3123 // instruction. Widen memory instructions involved in address computation
3124 // will lead to gather/scatter instructions, which don't need to be
3125 // handled.
3127 VPHeaderPHIRecipe>(CurRec))
3128 continue;
3129
3130 // This recipe contributes to the address computation of a widen
3131 // load/store. If the underlying instruction has poison-generating flags,
3132 // drop them directly.
3133 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3134 VPValue *A, *B;
3135 // Dropping disjoint from an OR may yield incorrect results, as some
3136 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3137 // for dependence analysis). Instead, replace it with an equivalent Add.
3138 // This is possible as all users of the disjoint OR only access lanes
3139 // where the operands are disjoint or poison otherwise.
3140 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3141 RecWithFlags->isDisjoint()) {
3142 VPBuilder Builder(RecWithFlags);
3143 VPInstruction *New = Builder.createOverflowingOp(
3144 Instruction::Add, {A, B}, {false, false},
3145 RecWithFlags->getDebugLoc());
3146 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3147 RecWithFlags->replaceAllUsesWith(New);
3148 RecWithFlags->eraseFromParent();
3149 CurRec = New;
3150 } else
3151 RecWithFlags->dropPoisonGeneratingFlags();
3152 } else {
3155 (void)Instr;
3156 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3157 "found instruction with poison generating flags not covered by "
3158 "VPRecipeWithIRFlags");
3159 }
3160
3161 // Add new definitions to the worklist.
3162 for (VPValue *Operand : CurRec->operands())
3163 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3164 Worklist.push_back(OpDef);
3165 }
3166 });
3167
3168 // Traverse all the recipes in the VPlan and collect the poison-generating
3169 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3170 // VPInterleaveRecipe.
3171 auto Iter = vp_depth_first_deep(Plan.getEntry());
3173 for (VPRecipeBase &Recipe : *VPBB) {
3174 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3175 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3176 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3177 if (AddrDef && WidenRec->isConsecutive() &&
3178 BlockNeedsPredication(UnderlyingInstr.getParent()))
3179 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3180 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3181 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3182 if (AddrDef) {
3183 // Check if any member of the interleave group needs predication.
3184 const InterleaveGroup<Instruction> *InterGroup =
3185 InterleaveRec->getInterleaveGroup();
3186 bool NeedPredication = false;
3187 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3188 I < NumMembers; ++I) {
3189 Instruction *Member = InterGroup->getMember(I);
3190 if (Member)
3191 NeedPredication |= BlockNeedsPredication(Member->getParent());
3192 }
3193
3194 if (NeedPredication)
3195 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3196 }
3197 }
3198 }
3199 }
3200}
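// Illustrative sketch (not part of the original source), assuming a
// consecutive load in a predicated block whose address is computed with
// poison-generating flags:
//
//   %idx   = add nuw nsw i64 %i, 1
//   %idx.d = or disjoint i64 %idx, 4
//   %gep   = getelementptr inbounds i32, ptr %base, i64 %idx.d
//   %v     = load i32, ptr %gep
//
// Walking the backward slice of the address drops nuw/nsw from the add and
// inbounds from the GEP. The disjoint OR is not merely stripped of its flag;
// it is replaced by a plain add, since earlier analyses (e.g. SCEV) may
// already have treated it as an add.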
3201
3203 VPlan &Plan,
3205 &InterleaveGroups,
3206 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3207 if (InterleaveGroups.empty())
3208 return;
3209
3210 // Interleave memory: for each Interleave Group we marked earlier as relevant
3211 // for this VPlan, replace the Recipes widening its memory instructions with a
3212 // single VPInterleaveRecipe at its insertion point.
3213 VPDominatorTree VPDT(Plan);
3214 for (const auto *IG : InterleaveGroups) {
3215 auto *Start =
3216 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3217 VPIRMetadata InterleaveMD(*Start);
3218 SmallVector<VPValue *, 4> StoredValues;
3219 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3220 StoredValues.push_back(StoreR->getStoredValue());
3221 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3222 Instruction *MemberI = IG->getMember(I);
3223 if (!MemberI)
3224 continue;
3225 VPWidenMemoryRecipe *MemoryR =
3226 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3227 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3228 StoredValues.push_back(StoreR->getStoredValue());
3229 InterleaveMD.intersect(*MemoryR);
3230 }
3231
3232 bool NeedsMaskForGaps =
3233 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3234 (!StoredValues.empty() && !IG->isFull());
3235
3236 Instruction *IRInsertPos = IG->getInsertPos();
3237 auto *InsertPos =
3238 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3239
3241 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3242 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3243 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3244
3245 // Get or create the start address for the interleave group.
3246 VPValue *Addr = Start->getAddr();
3247 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3248 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3249 // We cannot re-use the address of member zero because it does not
3250 // dominate the insert position. Instead, use the address of the insert
3251 // position and create a PtrAdd adjusting it to the address of member
3252 // zero.
3253 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3254 // InsertPos or sink loads above zero members to join it.
3255 assert(IG->getIndex(IRInsertPos) != 0 &&
3256 "index of insert position shouldn't be zero");
3257 auto &DL = IRInsertPos->getDataLayout();
3258 APInt Offset(32,
3259 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3260 IG->getIndex(IRInsertPos),
3261 /*IsSigned=*/true);
3262 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3263 VPBuilder B(InsertPos);
3264 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3265 }
3266 // If the group is reverse, adjust the index to refer to the last vector
3267 // lane instead of the first. We adjust the index from the first vector
3268 // lane, rather than directly getting the pointer for lane VF - 1, because
3269 // the pointer operand of the interleaved access is supposed to be uniform.
3270 if (IG->isReverse()) {
3271 auto *ReversePtr = new VPVectorEndPointerRecipe(
3272 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3273 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3274 ReversePtr->insertBefore(InsertPos);
3275 Addr = ReversePtr;
3276 }
3277 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3278 InsertPos->getMask(), NeedsMaskForGaps,
3279 InterleaveMD, InsertPos->getDebugLoc());
3280 VPIG->insertBefore(InsertPos);
3281
3282 unsigned J = 0;
3283 for (unsigned i = 0; i < IG->getFactor(); ++i)
3284 if (Instruction *Member = IG->getMember(i)) {
3285 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3286 if (!Member->getType()->isVoidTy()) {
3287 VPValue *OriginalV = MemberR->getVPSingleValue();
3288 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3289 J++;
3290 }
3291 MemberR->eraseFromParent();
3292 }
3293 }
3294}
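// Worked example (illustrative, not from the original source): for a group of
// i32 members whose insert position is the member at index 2, and whose
// member-0 address does not dominate the insert position, the group's start
// address is derived from the insert position's address as
//   Offset = TypeAllocSize(i32) * Index = 4 * 2 = 8
//   Addr   = ptradd %insert.pos.addr, -8
// i.e. the address of member 0. For reverse groups, an additional
// VPVectorEndPointer recipe adjusts the address to refer to the last vector
// lane instead of the first, keeping the pointer operand uniform.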
3295
3296/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3297/// value, phi and backedge value. In the following example:
3298///
3299/// vector.ph:
3300/// Successor(s): vector loop
3301///
3302/// <x1> vector loop: {
3303/// vector.body:
3304/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3305/// ...
3306/// EMIT branch-on-count ...
3307/// No successors
3308/// }
3309///
3310/// WIDEN-INDUCTION will get expanded to:
3311///
3312/// vector.ph:
3313/// ...
3314/// vp<%induction.start> = ...
3315/// vp<%induction.increment> = ...
3316///
3317/// Successor(s): vector loop
3318///
3319/// <x1> vector loop: {
3320/// vector.body:
3321/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3322/// ...
3323/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3324/// EMIT branch-on-count ...
3325/// No successors
3326/// }
3327static void
3329 VPTypeAnalysis &TypeInfo) {
3330 VPlan *Plan = WidenIVR->getParent()->getPlan();
3331 VPValue *Start = WidenIVR->getStartValue();
3332 VPValue *Step = WidenIVR->getStepValue();
3333 VPValue *VF = WidenIVR->getVFValue();
3334 DebugLoc DL = WidenIVR->getDebugLoc();
3335
3336 // The value from the original loop to which we are mapping the new induction
3337 // variable.
3338 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3339
3340 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3343 VPIRFlags Flags = *WidenIVR;
3344 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3345 AddOp = Instruction::Add;
3346 MulOp = Instruction::Mul;
3347 } else {
3348 AddOp = ID.getInductionOpcode();
3349 MulOp = Instruction::FMul;
3350 }
3351
3352 // If the phi is truncated, truncate the start and step values.
3353 VPBuilder Builder(Plan->getVectorPreheader());
3354 Type *StepTy = TypeInfo.inferScalarType(Step);
3355 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3356 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3357 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3358 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3359 // Truncation doesn't preserve WrapFlags.
3360 Flags.dropPoisonGeneratingFlags();
3361 StepTy = Ty;
3362 }
3363
3364 // Construct the initial value of the vector IV in the vector loop preheader.
3365 Type *IVIntTy =
3367 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3368 if (StepTy->isFloatingPointTy())
3369 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3370
3371 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3372 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3373
3374 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3375 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3376 DebugLoc::getUnknown(), "induction");
3377
3378 // Create the widened phi of the vector IV.
3379 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3380 WidenIVR->getDebugLoc(), "vec.ind");
3381 WidePHI->insertBefore(WidenIVR);
3382
3383 // Create the backedge value for the vector IV.
3384 VPValue *Inc;
3385 VPValue *Prev;
3386 // If unrolled, use the increment and prev value from the operands.
3387 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3388 Inc = SplatVF;
3389 Prev = WidenIVR->getLastUnrolledPartOperand();
3390 } else {
3391 if (VPRecipeBase *R = VF->getDefiningRecipe())
3392 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3393 // Multiply the vectorization factor by the step using integer or
3394 // floating-point arithmetic as appropriate.
3395 if (StepTy->isFloatingPointTy())
3396 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3397 DL);
3398 else
3399 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3400 TypeInfo.inferScalarType(VF), DL);
3401
3402 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3403 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3404 Prev = WidePHI;
3405 }
3406
3408 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3409 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3410 WidenIVR->getDebugLoc(), "vec.ind.next");
3411
3412 WidePHI->addOperand(Next);
3413
3414 WidenIVR->replaceAllUsesWith(WidePHI);
3415}
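// Worked example (illustrative, not from the original source): for an integer
// IV with Start = 5, Step = 3 and VF = 4, the recipes created above compute
//   vp<%induction.start>     = <5,5,5,5> + <0,1,2,3> * <3,3,3,3> = <5,8,11,14>
//   vp<%induction.increment> = broadcast(3 * 4)                  = <12,...,12>
// so the vector IV takes the values <5,8,11,14>, <17,20,23,26>, ... across
// vector iterations.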
3416
3417/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3418/// initial value, phi and backedge value. In the following example:
3419///
3420/// <x1> vector loop: {
3421/// vector.body:
3422/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3423/// ...
3424/// EMIT branch-on-count ...
3425/// }
3426///
3427/// WIDEN-POINTER-INDUCTION will get expanded to:
3428///
3429/// <x1> vector loop: {
3430/// vector.body:
3431/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3432/// EMIT %mul = mul %stepvector, %step
3433/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3434/// ...
3435/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3436/// EMIT branch-on-count ...
3437/// }
3439 VPTypeAnalysis &TypeInfo) {
3440 VPlan *Plan = R->getParent()->getPlan();
3441 VPValue *Start = R->getStartValue();
3442 VPValue *Step = R->getStepValue();
3443 VPValue *VF = R->getVFValue();
3444
3445 assert(R->getInductionDescriptor().getKind() ==
3447 "Not a pointer induction according to InductionDescriptor!");
3448 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3449 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3450 "Recipe should have been replaced");
3451
3452 VPBuilder Builder(R);
3453 DebugLoc DL = R->getDebugLoc();
3454
3455 // Build a scalar pointer phi.
3456 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3457
3458 // Create actual address geps that use the pointer phi as base and a
3459 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3460 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3461 Type *StepTy = TypeInfo.inferScalarType(Step);
3462 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3463 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3464 VPValue *PtrAdd = Builder.createNaryOp(
3465 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3466 R->replaceAllUsesWith(PtrAdd);
3467
3468 // Create the backedge value for the scalar pointer phi.
3470 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3471 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3472 DL);
3473 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3474
3475 VPValue *InductionGEP =
3476 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3477 ScalarPtrPhi->addOperand(InductionGEP);
3478}
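// Worked example (illustrative, not from the original source): with VF = 4 the
// expansion above yields, per vector iteration,
//   %mul        = <0,1,2,3> * %step                ; per-lane offsets
//   %vector.gep = wide-ptradd %pointer.phi, %mul   ; address for each lane
//   %ptr.ind    = ptradd %pointer.phi, %step * 4   ; scalar backedge value
// so only the scalar pointer phi is carried across iterations, while the
// per-lane addresses are rematerialized from it in every iteration.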
3479
3481  // Replace loop regions with explicit CFG.
3482 SmallVector<VPRegionBlock *> LoopRegions;
3484 vp_depth_first_deep(Plan.getEntry()))) {
3485 if (!R->isReplicator())
3486 LoopRegions.push_back(R);
3487 }
3488 for (VPRegionBlock *R : LoopRegions)
3489 R->dissolveToCFGLoop();
3490}
3491
3493 VPTypeAnalysis TypeInfo(Plan);
3496 vp_depth_first_deep(Plan.getEntry()))) {
3497 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3498 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3499 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3500 ToRemove.push_back(WidenIVR);
3501 continue;
3502 }
3503
3504 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3505 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3506 ToRemove.push_back(WidenIVR);
3507 continue;
3508 }
3509
3510 // Expand VPBlendRecipe into VPInstruction::Select.
3511 VPBuilder Builder(&R);
3512 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3513 VPValue *Select = Blend->getIncomingValue(0);
3514 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3515 Select = Builder.createSelect(Blend->getMask(I),
3516 Blend->getIncomingValue(I), Select,
3517 R.getDebugLoc(), "predphi");
3518 Blend->replaceAllUsesWith(Select);
3519 ToRemove.push_back(Blend);
3520 }
3521
3522 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3523 Expr->decompose();
3524 ToRemove.push_back(Expr);
3525 }
3526
3527 VPValue *VectorStep;
3528 VPValue *ScalarStep;
3530 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3531 continue;
3532
3533 // Expand WideIVStep.
3534 auto *VPI = cast<VPInstruction>(&R);
3535 Type *IVTy = TypeInfo.inferScalarType(VPI);
3536 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3538 ? Instruction::UIToFP
3539 : Instruction::Trunc;
3540 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3541 }
3542
3543 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3544 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3545 ScalarStep =
3546 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3547 }
3548
3549 VPIRFlags Flags;
3550 if (IVTy->isFloatingPointTy())
3551 Flags = {VPI->getFastMathFlags()};
3552
3553 unsigned MulOpc =
3554 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3555 VPInstruction *Mul = Builder.createNaryOp(
3556 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3557 VectorStep = Mul;
3558 VPI->replaceAllUsesWith(VectorStep);
3559 ToRemove.push_back(VPI);
3560 }
3561 }
3562
3563 for (VPRecipeBase *R : ToRemove)
3564 R->eraseFromParent();
3565}
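// Illustrative example (not from the original source) of the VPBlendRecipe
// expansion above: a blend with three incoming (value, mask) pairs
//   BLEND %phi = v0 v1/m1 v2/m2
// becomes a chain of selects ending at the last incoming value:
//   %sel1 = select m1, v1, v0
//   %phi  = select m2, v2, %sel1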
3566
3568 VPBasicBlock *EarlyExitVPBB,
3569 VPlan &Plan,
3570 VPBasicBlock *HeaderVPBB,
3571 VPBasicBlock *LatchVPBB) {
3572 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
3573 if (!EarlyExitVPBB->getSinglePredecessor() &&
3574 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3575 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3576 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3577 "unsupported early exit VPBB");
3578 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3579 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3580 // of the phis.
3581 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3582 cast<VPIRPhi>(&R)->swapOperands();
3583 }
3584
3585 VPBuilder Builder(LatchVPBB->getTerminator());
3586 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3587 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3588         "Terminator must be BranchOnCond");
3589 VPValue *CondOfEarlyExitingVPBB =
3590 EarlyExitingVPBB->getTerminator()->getOperand(0);
3591 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3592 ? CondOfEarlyExitingVPBB
3593 : Builder.createNot(CondOfEarlyExitingVPBB);
3594
3595 // Split the middle block and have it conditionally branch to the early exit
3596 // block if CondToEarlyExit.
3597 VPValue *IsEarlyExitTaken =
3598 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3599 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
3600 VPBasicBlock *VectorEarlyExitVPBB =
3601 Plan.createVPBasicBlock("vector.early.exit");
3602 VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
3603 VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
3604 NewMiddle->swapSuccessors();
3605
3606 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3607
3608 // Update the exit phis in the early exit block.
3609 VPBuilder MiddleBuilder(NewMiddle);
3610 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3611 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3612 auto *ExitIRI = cast<VPIRPhi>(&R);
3613 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3614 // a single predecessor and 1 if it has two.
3615 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3616 if (ExitIRI->getNumOperands() != 1) {
3617 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3618 // predecessor. Extract its last lane.
3619 ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
3620 }
3621
3622 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3623 if (!IncomingFromEarlyExit->isLiveIn()) {
3624 // Update the incoming value from the early exit.
3625 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3626 VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
3627 "first.active.lane");
3628 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3629 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3630 nullptr, "early.exit.value");
3631 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3632 }
3633 }
3634 MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
3635
3636   // Replace the condition controlling the non-early exit from the vector loop
3637   // with one that exits if either the original latch condition is true or the
3638   // early exit has been taken.
3639 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3640 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3641 "Unexpected terminator");
3642 auto *IsLatchExitTaken =
3643 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3644 LatchExitingBranch->getOperand(1));
3645 auto *AnyExitTaken = Builder.createNaryOp(
3646 Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
3647 Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
3648 LatchExitingBranch->eraseFromParent();
3649}
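// Resulting structure (illustrative sketch, not from the original source): the
// latch now feeds a new middle.split block, which branches to
// vector.early.exit when any lane took the early exit and to the original
// middle block otherwise, while the latch itself exits when either condition
// holds:
//
//   vector.latch:
//     EMIT vp<%early.exit.taken> = any-of vp<%cond.to.early.exit>
//     EMIT vp<%latch.exit.taken> = icmp eq vp<%iv.next>, vp<%trip.count>
//     EMIT branch-on-cond (vp<%early.exit.taken> || vp<%latch.exit.taken>)
//   middle.split:
//     EMIT branch-on-cond vp<%early.exit.taken>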
3650
3651/// This function tries to convert extended in-loop reductions to
3652/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
3653/// valid. The created recipe must be decomposed to its constituent
3654/// recipes before execution.
3655static VPExpressionRecipe *
3657 VFRange &Range) {
3658 Type *RedTy = Ctx.Types.inferScalarType(Red);
3659 VPValue *VecOp = Red->getVecOp();
3660
3661 // Clamp the range if using extended-reduction is profitable.
3662 auto IsExtendedRedValidAndClampRange =
3663 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
3665 [&](ElementCount VF) {
3666 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3668
3669 InstructionCost ExtRedCost;
3670 InstructionCost ExtCost =
3671 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
3672 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3673
3677 // FIXME: Move partial reduction creation, costing and clamping
3678 // here from LoopVectorize.cpp.
3679 ExtRedCost = Ctx.TTI.getPartialReductionCost(
3680 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
3681 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
3682 } else {
3683 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
3684 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
3685 Red->getFastMathFlags(), CostKind);
3686 }
3687 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
3688 },
3689 Range);
3690 };
3691
3692 VPValue *A;
3693  // Match reduce(ext).
3694 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
3695 IsExtendedRedValidAndClampRange(
3696 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
3697 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
3698 Ctx.Types.inferScalarType(A)))
3699 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
3700
3701 return nullptr;
3702}
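// Illustrative example (not from the original source): for
//   reduce.add(zext <16 x i8> %a to <16 x i32>)
// the extend and the reduction are bundled into a single VPExpressionRecipe
// when TTI reports an extended (or partial) reduction that is cheaper than
// costing the extend and the reduction separately, i.e. when
//   ExtRedCost < ExtCost + RedCost.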
3703
3704/// This function tries to convert multiply-accumulate in-loop reductions to
3705/// VPExpressionRecipe and clamp the \p Range if it is beneficial
3706/// and valid. The created VPExpressionRecipe must be decomposed to its
3707/// constituent recipes before execution. Patterns of the
3708/// VPExpressionRecipe:
3709/// reduce.add(mul(...)),
3710/// reduce.add(mul(ext(A), ext(B))),
3711/// reduce.add(ext(mul(ext(A), ext(B)))).
3712static VPExpressionRecipe *
3714 VPCostContext &Ctx, VFRange &Range) {
3715 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
3716
3717 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3718 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
3719 return nullptr;
3720
3721 Type *RedTy = Ctx.Types.inferScalarType(Red);
3722
3723 // Clamp the range if using multiply-accumulate-reduction is profitable.
3724 auto IsMulAccValidAndClampRange =
3726 VPWidenCastRecipe *OuterExt) -> bool {
3728 [&](ElementCount VF) {
3730 Type *SrcTy =
3731 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
3732 InstructionCost MulAccCost;
3733
3734 if (IsPartialReduction) {
3735 Type *SrcTy2 =
3736 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
3737 // FIXME: Move partial reduction creation, costing and clamping
3738 // here from LoopVectorize.cpp.
3739 MulAccCost = Ctx.TTI.getPartialReductionCost(
3740 Opcode, SrcTy, SrcTy2, RedTy, VF,
3742 Ext0->getOpcode())
3745 Ext1->getOpcode())
3747 Mul->getOpcode(), CostKind);
3748 } else {
3749 // Only partial reductions support mixed extends at the moment.
3750 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
3751 return false;
3752
3753 bool IsZExt =
3754 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
3755 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
3756 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
3757 SrcVecTy, CostKind);
3758 }
3759
3760 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
3761 InstructionCost RedCost = Red->computeCost(VF, Ctx);
3762 InstructionCost ExtCost = 0;
3763 if (Ext0)
3764 ExtCost += Ext0->computeCost(VF, Ctx);
3765 if (Ext1)
3766 ExtCost += Ext1->computeCost(VF, Ctx);
3767 if (OuterExt)
3768 ExtCost += OuterExt->computeCost(VF, Ctx);
3769
3770 return MulAccCost.isValid() &&
3771 MulAccCost < ExtCost + MulCost + RedCost;
3772 },
3773 Range);
3774 };
3775
3776 VPValue *VecOp = Red->getVecOp();
3777 VPRecipeBase *Sub = nullptr;
3778 VPValue *A, *B;
3779 VPValue *Tmp = nullptr;
3780 // Sub reductions could have a sub between the add reduction and vec op.
3781 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
3782 Sub = VecOp->getDefiningRecipe();
3783 VecOp = Tmp;
3784 }
3785
3786 // If ValB is a constant and can be safely extended, truncate it to the same
3787 // type as ExtA's operand, then extend it to the same type as ExtA. This
3788 // creates two uniform extends that can more easily be matched by the rest of
3789 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
3790 // replaced with the new extend of the constant.
3791 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
3792 VPWidenCastRecipe *&ExtB,
3793 VPValue *&ValB, VPWidenRecipe *Mul) {
3794 if (!ExtA || ExtB || !ValB->isLiveIn())
3795 return;
3796 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
3797 Instruction::CastOps ExtOpc = ExtA->getOpcode();
3798 const APInt *Const;
3799 if (!match(ValB, m_APInt(Const)) ||
3801 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
3802 return;
3803 // The truncate ensures that the type of each extended operand is the
3804 // same, and it's been proven that the constant can be extended from
3805 // NarrowTy safely. Necessary since ExtA's extended operand would be
3806 // e.g. an i8, while the const will likely be an i32. This will be
3807 // elided by later optimisations.
3808 VPBuilder Builder(Mul);
3809 auto *Trunc =
3810 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
3811 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
3812 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
3813 Mul->setOperand(1, ExtB);
3814 };
3815
3816 // Try to match reduce.add(mul(...)).
3817 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
3820 auto *Mul = cast<VPWidenRecipe>(VecOp);
3821
3822 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
3823 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
3824
3825 // Match reduce.add/sub(mul(ext, ext)).
3826 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
3827 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3828 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
3829 if (Sub)
3830 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
3831 cast<VPWidenRecipe>(Sub), Red);
3832 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3833 }
3834 // TODO: Add an expression type for this variant with a negated mul
3835 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
3836 return new VPExpressionRecipe(Mul, Red);
3837 }
3838 // TODO: Add an expression type for negated versions of other expression
3839 // variants.
3840 if (Sub)
3841 return nullptr;
3842
3843 // Match reduce.add(ext(mul(A, B))).
3844 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
3845 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
3846 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
3849
3850 // reduce.add(ext(mul(ext, const)))
3851 // -> reduce.add(ext(mul(ext, ext(const))))
3852 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
3853
3854 // reduce.add(ext(mul(ext(A), ext(B))))
3855 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
3856 // The inner extends must either have the same opcode as the outer extend or
3857 // be the same, in which case the multiply can never result in a negative
3858 // value and the outer extend can be folded away by doing wider
3859 // extends for the operands of the mul.
3860 if (Ext0 && Ext1 &&
3861 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
3862 Ext0->getOpcode() == Ext1->getOpcode() &&
3863 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
3864 auto *NewExt0 = new VPWidenCastRecipe(
3865 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
3866 *Ext0, *Ext0, Ext0->getDebugLoc());
3867 NewExt0->insertBefore(Ext0);
3868
3869 VPWidenCastRecipe *NewExt1 = NewExt0;
3870 if (Ext0 != Ext1) {
3871 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
3872 Ext->getResultType(), nullptr, *Ext1,
3873 *Ext1, Ext1->getDebugLoc());
3874 NewExt1->insertBefore(Ext1);
3875 }
3876 Mul->setOperand(0, NewExt0);
3877 Mul->setOperand(1, NewExt1);
3878 Red->setOperand(1, Mul);
3879 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
3880 }
3881 }
3882 return nullptr;
3883}
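// Illustrative examples (not from the original source) of bundles formed
// above, assuming the cost check succeeds:
//   reduce.add(mul(zext(%a), zext(%b)))       -> single multiply-accumulate
//                                                expression
//   reduce.add(mul(sext(%a), 42))             -> the constant is first rewritten
//                                                as sext(trunc(42)), when it is
//                                                provably safe to extend from
//                                                the narrow type
//   reduce.add(sext(mul(sext(%a), sext(%b)))) -> the outer extend is folded
//                                                away by widening the inner
//                                                extends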
3884
3885/// This function tries to create abstract recipes from the reduction recipe for
3886/// subsequent optimizations and cost estimation.
3888 VPCostContext &Ctx,
3889 VFRange &Range) {
3890 VPExpressionRecipe *AbstractR = nullptr;
3891 auto IP = std::next(Red->getIterator());
3892 auto *VPBB = Red->getParent();
3893 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
3894 AbstractR = MulAcc;
3895 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
3896 AbstractR = ExtRed;
3897 // Cannot create abstract inloop reduction recipes.
3898 if (!AbstractR)
3899 return;
3900
3901 AbstractR->insertBefore(*VPBB, IP);
3902 Red->replaceAllUsesWith(AbstractR);
3903}
3904
3915
3917 if (Plan.hasScalarVFOnly())
3918 return;
3919
3920#ifndef NDEBUG
3921 VPDominatorTree VPDT(Plan);
3922#endif
3923
3924 SmallVector<VPValue *> VPValues;
3927 append_range(VPValues, Plan.getLiveIns());
3928 for (VPRecipeBase &R : *Plan.getEntry())
3929 append_range(VPValues, R.definedValues());
3930
3931 auto *VectorPreheader = Plan.getVectorPreheader();
3932 for (VPValue *VPV : VPValues) {
3934 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3935 isa<Constant>(VPV->getLiveInIRValue())))
3936 continue;
3937
3938 // Add explicit broadcast at the insert point that dominates all users.
3939 VPBasicBlock *HoistBlock = VectorPreheader;
3940 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
3941 for (VPUser *User : VPV->users()) {
3942 if (User->usesScalars(VPV))
3943 continue;
3944 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
3945 HoistPoint = HoistBlock->begin();
3946 else
3947 assert(VPDT.dominates(VectorPreheader,
3948 cast<VPRecipeBase>(User)->getParent()) &&
3949 "All users must be in the vector preheader or dominated by it");
3950 }
3951
3952 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3953 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3954 VPV->replaceUsesWithIf(Broadcast,
3955 [VPV, Broadcast](VPUser &U, unsigned Idx) {
3956 return Broadcast != &U && !U.usesScalars(VPV);
3957 });
3958 }
3959}
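// Illustrative example (not from the original source): a live-in ir<%n> used
// by both a scalar recipe and a widened compare gets an explicit
//   vp<%bc> = broadcast ir<%n>
// in the vector preheader; the widened compare is rewritten to use vp<%bc>,
// while users of the scalar value keep using ir<%n> directly.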
3960
3962 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3963
3964   // Collect candidate loads with invariant addresses and noalias-scope
3965   // metadata, as well as memory-writing recipes with noalias metadata.
3969 vp_depth_first_shallow(LoopRegion->getEntry()))) {
3970 for (VPRecipeBase &R : *VPBB) {
3971 // Only handle single-scalar replicated loads with invariant addresses.
3972 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
3973 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
3974 RepR->getOpcode() != Instruction::Load)
3975 continue;
3976
3977 VPValue *Addr = RepR->getOperand(0);
3978 if (Addr->isDefinedOutsideLoopRegions()) {
3980 if (!Loc.AATags.Scope)
3981 continue;
3982 CandidateLoads.push_back({RepR, Loc});
3983 }
3984 }
3985 if (R.mayWriteToMemory()) {
3987 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
3988 return;
3989 Stores.push_back(*Loc);
3990 }
3991 }
3992 }
3993
3994 VPBasicBlock *Preheader = Plan.getVectorPreheader();
3995 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
3996 // Hoist the load to the preheader if it doesn't alias with any stores
3997 // according to the noalias metadata. Other loads should have been hoisted
3998     // by other passes.
3999 const AAMDNodes &LoadAA = LoadLoc.AATags;
4000 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4002 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4003 })) {
4004 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4005 }
4006 }
4007}
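// Illustrative example (not from the original source): a single-scalar load
//   %v = load i32, ptr %p, !alias.scope !1
// from a loop-invariant address is moved to the vector preheader if every
// memory-writing recipe in the loop carries !noalias metadata covering scope
// !1, i.e. the scoped-noalias metadata alone proves the loop's stores cannot
// clobber the loaded location.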
4008
4010 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4012 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4013 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4014
4015 VPValue *TC = Plan.getTripCount();
4016 // Skip cases for which the trip count may be non-trivial to materialize.
4017 // I.e., when a scalar tail is absent - due to tail folding, or when a scalar
4018 // tail is required.
4019 if (!Plan.hasScalarTail() ||
4021 Plan.getScalarPreheader() ||
4022 !TC->isLiveIn())
4023 return;
4024
4025   // Materialize the vector trip count early if the trip count is constant and
4026   // can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
4027 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4028 // tail-folded loops.
4029 ScalarEvolution &SE = *PSE.getSE();
4030 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4031 if (!isa<SCEVConstant>(TCScev))
4032 return;
4033 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4034 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4035 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4036 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4037}
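// Worked example (illustrative, not from the original source): with a constant
// trip count of 1003, VF = 4 and UF = 2, VF * UF = 8 and the vector trip count
// folds to (1003 udiv 8) * 8 = 125 * 8 = 1000, so the vector loop executes 125
// iterations and 3 iterations are left for the scalar epilogue.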
4038
4040 VPBasicBlock *VectorPH) {
4042 if (BTC->getNumUsers() == 0)
4043 return;
4044
4045 VPBuilder Builder(VectorPH, VectorPH->begin());
4046 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4047 auto *TCMO = Builder.createNaryOp(
4048 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4049 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4050 BTC->replaceAllUsesWith(TCMO);
4051}
4052
4054 if (Plan.hasScalarVFOnly())
4055 return;
4056
4057 VPTypeAnalysis TypeInfo(Plan);
4058 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4059 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4061 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4062 vp_depth_first_shallow(LoopRegion->getEntry()));
4063 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4064 // VPInstructions, excluding ones in replicate regions. Those are not
4065   // materialized explicitly yet; their vector users are still handled in
4066   // VPReplicateRegion::execute(), via shouldPack().
4067 // TODO: materialize build vectors for replicating recipes in replicating
4068 // regions.
4069 for (VPBasicBlock *VPBB :
4070 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4071 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4073 continue;
4074 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4075 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4076 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4077 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4078 };
4079 if ((isa<VPReplicateRecipe>(DefR) &&
4080 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4081 (isa<VPInstruction>(DefR) &&
4083 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4084 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4085 continue;
4086
4087 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4088 unsigned Opcode = ScalarTy->isStructTy()
4091 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4092 BuildVector->insertAfter(DefR);
4093
4094 DefR->replaceUsesWithIf(
4095 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4096 VPUser &U, unsigned) {
4097 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4098 });
4099 }
4100 }
4101
4102 // Create explicit VPInstructions to convert vectors to scalars. The current
4103 // implementation is conservative - it may miss some cases that may or may not
4104 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4105 // if they are known to operate on scalar values.
4106 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4107 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4110 continue;
4111 for (VPValue *Def : R.definedValues()) {
4112 // Skip recipes that are single-scalar or only have their first lane
4113 // used.
4114 // TODO: The Defs skipped here may or may not be vector values.
4115 // Introduce Unpacks, and remove them later, if they are guaranteed to
4116 // produce scalar values.
4118 continue;
4119
4120 // At the moment, we create unpacks only for scalar users outside
4121 // replicate regions. Recipes inside replicate regions still extract the
4122 // required lanes implicitly.
4123 // TODO: Remove once replicate regions are unrolled completely.
4124 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4125 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4126 return U->usesScalars(Def) &&
4127 (!ParentRegion || !ParentRegion->isReplicator());
4128 };
4129 if (none_of(Def->users(), IsCandidateUnpackUser))
4130 continue;
4131
4132 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4133 if (R.isPhi())
4134 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4135 else
4136 Unpack->insertAfter(&R);
4137 Def->replaceUsesWithIf(Unpack,
4138 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4139 return IsCandidateUnpackUser(&U);
4140 });
4141 }
4142 }
4143 }
4144}
4145
4147 VPBasicBlock *VectorPHVPBB,
4148 bool TailByMasking,
4149 bool RequiresScalarEpilogue) {
4150 VPValue &VectorTC = Plan.getVectorTripCount();
4151 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4152 // There's nothing to do if there are no users of the vector trip count or its
4153 // IR value has already been set.
4154 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4155 return;
4156
4157 VPValue *TC = Plan.getTripCount();
4158 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4159 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4160 VPValue *Step = &Plan.getVFxUF();
4161
4162 // If the tail is to be folded by masking, round the number of iterations N
4163 // up to a multiple of Step instead of rounding down. This is done by first
4164 // adding Step-1 and then rounding down. Note that it's ok if this addition
4165 // overflows: the vector induction variable will eventually wrap to zero given
4166 // that it starts at zero and its Step is a power of two; the loop will then
4167 // exit, with the last early-exit vector comparison also producing all-true.
4168 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4169 // is accounted for in emitIterationCountCheck that adds an overflow check.
4170 if (TailByMasking) {
4171 TC = Builder.createNaryOp(
4172 Instruction::Add,
4173 {TC, Builder.createNaryOp(Instruction::Sub,
4174 {Step, Plan.getConstantInt(TCTy, 1)})},
4175 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4176 }
4177
4178 // Now we need to generate the expression for the part of the loop that the
4179 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4180 // iterations are not required for correctness, or N - Step, otherwise. Step
4181 // is equal to the vectorization factor (number of SIMD elements) times the
4182 // unroll factor (number of SIMD instructions).
4183 VPValue *R =
4184 Builder.createNaryOp(Instruction::URem, {TC, Step},
4185 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4186
4187 // There are cases where we *must* run at least one iteration in the remainder
4188 // loop. See the cost model for when this can happen. If the step evenly
4189 // divides the trip count, we set the remainder to be equal to the step. If
4190 // the step does not evenly divide the trip count, no adjustment is necessary
4191 // since there will already be scalar iterations. Note that the minimum
4192 // iterations check ensures that N >= Step.
4193 if (RequiresScalarEpilogue) {
4194 assert(!TailByMasking &&
4195           "requiring scalar epilogue is not supported with tail folding");
4196 VPValue *IsZero =
4197 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4198 R = Builder.createSelect(IsZero, Step, R);
4199 }
4200
4201 VPValue *Res = Builder.createNaryOp(
4202 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4203 VectorTC.replaceAllUsesWith(Res);
4204}
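// Worked example (illustrative, not from the original source), with trip count
// N = 17 and Step = VF * UF = 8:
//   * default:                  n.mod.vf = 17 urem 8 = 1, n.vec = 16
//   * tail folded by masking:   n.rnd.up = 17 + 7 = 24, n.mod.vf = 0, n.vec = 24
//   * scalar epilogue required: for N = 16, n.mod.vf = 0 is bumped to 8, so
//                               n.vec = 8 and the epilogue runs 8 iterations.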
4205
4207 ElementCount VFEC) {
4208 VPBuilder Builder(VectorPH, VectorPH->begin());
4209 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4210 VPValue &VF = Plan.getVF();
4211 VPValue &VFxUF = Plan.getVFxUF();
4212 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4213 // used.
4214 // TODO: Assert that they aren't used.
4215
4216 // If there are no users of the runtime VF, compute VFxUF by constant folding
4217 // the multiplication of VF and UF.
4218 if (VF.getNumUsers() == 0) {
4219 VPValue *RuntimeVFxUF =
4220 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4221 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4222 return;
4223 }
4224
4225 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4226 // vscale) * UF.
4227 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4229 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4231 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4232 }
4233 VF.replaceAllUsesWith(RuntimeVF);
4234
4235 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4236 VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
4237 VFxUF.replaceAllUsesWith(MulByUF);
4238}
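// Worked example (illustrative, not from the original source): for a scalable
// VF of vscale x 4 with UF = 2, if nothing uses the runtime VF then VF * UF is
// emitted directly as the element count 8 x vscale. Otherwise the runtime VF
// is materialized as 4 * vscale (broadcast for its vector users) and VFxUF as
// (4 * vscale) * 2.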
4239
4242 const DataLayout &DL = SE.getDataLayout();
4243 SCEVExpander Expander(SE, DL, "induction", /*PreserveLCSSA=*/false);
4244
4245 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4246 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4247 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4248 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4250 continue;
4251 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4252 if (!ExpSCEV)
4253 break;
4254 const SCEV *Expr = ExpSCEV->getSCEV();
4255 Value *Res =
4256 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4257 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4258 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4259 ExpSCEV->replaceAllUsesWith(Exp);
4260 if (Plan.getTripCount() == ExpSCEV)
4261 Plan.resetTripCount(Exp);
4262 ExpSCEV->eraseFromParent();
4263 }
4265 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4266 "after any VPIRInstructions");
4267 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4268 // to the VPIRBasicBlock.
4269 auto EI = Entry->begin();
4270 for (Instruction &I : drop_end(*EntryBB)) {
4271 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4272 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4273 EI++;
4274 continue;
4275 }
4277 }
4278
4279 return ExpandedSCEVs;
4280}
4281
4282/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
4283/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
4284/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
4285/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
4286/// an index-independent load if it feeds all wide ops at all indices (\p OpV
4287/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
4288/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
4289/// is defined at \p Idx of a load interleave group.
4290static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4291 VPValue *OpV, unsigned Idx) {
4292 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4293 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4294 if (!Member0OpR)
4295 return Member0Op == OpV;
4296 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4297 return !W->getMask() && Member0Op == OpV;
4298 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4299 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4300 return false;
4301}
4302
4303/// Returns true if \p IR is a full interleave group with factor and number of
4304/// members both equal to \p VF. The interleave group must also access the full
4305/// vector width \p VectorRegWidth.
4307 ElementCount VF,
4308 VPTypeAnalysis &TypeInfo,
4309 TypeSize VectorRegWidth) {
4310 if (!InterleaveR || InterleaveR->getMask())
4311 return false;
4312
4313 Type *GroupElementTy = nullptr;
4314 if (InterleaveR->getStoredValues().empty()) {
4315 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4316 if (!all_of(InterleaveR->definedValues(),
4317 [&TypeInfo, GroupElementTy](VPValue *Op) {
4318 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4319 }))
4320 return false;
4321 } else {
4322 GroupElementTy =
4323 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4324 if (!all_of(InterleaveR->getStoredValues(),
4325 [&TypeInfo, GroupElementTy](VPValue *Op) {
4326 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4327 }))
4328 return false;
4329 }
4330
4331 unsigned VFMin = VF.getKnownMinValue();
4332 TypeSize GroupSize = TypeSize::get(
4333 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4334 const auto *IG = InterleaveR->getInterleaveGroup();
4335 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4336 GroupSize == VectorRegWidth;
4337}
4338
4339/// Returns true if \p VPV is a narrow VPValue.
4340static bool isAlreadyNarrow(VPValue *VPV) {
4341 if (VPV->isLiveIn())
4342 return true;
4343 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4344 return RepR && RepR->isSingleScalar();
4345}
4346
4347// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4348// a narrow variant.
4349static VPValue *
4351 auto *R = V->getDefiningRecipe();
4352 if (!R || NarrowedOps.contains(V))
4353 return V;
4354
4355 if (isAlreadyNarrow(V))
4356 return V;
4357
4358 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4359 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4360 WideMember0->setOperand(
4361 Idx,
4362 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4363 return V;
4364 }
4365
4366 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4367 // Narrow interleave group to wide load, as transformed VPlan will only
4368 // process one original iteration.
4369 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4370 auto *L = new VPWidenLoadRecipe(
4371 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4372 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4373 L->insertBefore(LoadGroup);
4374 NarrowedOps.insert(L);
4375 return L;
4376 }
4377
4378 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4379 assert(RepR->isSingleScalar() &&
4380 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4381 "must be a single scalar load");
4382 NarrowedOps.insert(RepR);
4383 return RepR;
4384 }
4385
4386 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4387 VPValue *PtrOp = WideLoad->getAddr();
4388 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4389 PtrOp = VecPtr->getOperand(0);
4390 // Narrow wide load to uniform scalar load, as transformed VPlan will only
4391 // process one original iteration.
4392 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4393 /*IsUniform*/ true,
4394 /*Mask*/ nullptr, {}, *WideLoad);
4395 N->insertBefore(WideLoad);
4396 NarrowedOps.insert(N);
4397 return N;
4398}
4399
4401 TypeSize VectorRegWidth) {
4402 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4403 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4404 return;
4405
4406 VPTypeAnalysis TypeInfo(Plan);
4407
4409 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4411 continue;
4412
4415 continue;
4416
4417 // Bail out on recipes not supported at the moment:
4418 // * phi recipes other than the canonical induction
4419 // * recipes writing to memory except interleave groups
4420 // Only support plans with a canonical induction phi.
4421 if (R.isPhi())
4422 return;
4423
4424 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4425 if (R.mayWriteToMemory() && !InterleaveR)
4426 return;
4427
4428 // Do not narrow interleave groups if there are VectorPointer recipes and
4429 // the plan was unrolled. The recipe implicitly uses VF from
4430 // VPTransformState.
4431 // TODO: Remove restriction once the VF for the VectorPointer offset is
4432 // modeled explicitly as operand.
4433 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
4434 return;
4435
4436 // All other ops are allowed, but we reject uses that cannot be converted
4437 // when checking all allowed consumers (store interleave groups) below.
4438 if (!InterleaveR)
4439 continue;
4440
4441 // Bail out on non-consecutive interleave groups.
4442 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
4443 VectorRegWidth))
4444 return;
4445
4446 // Skip read interleave groups.
4447 if (InterleaveR->getStoredValues().empty())
4448 continue;
4449
4450 // Narrow interleave groups, if all operands are already matching narrow
4451 // ops.
4452 auto *Member0 = InterleaveR->getStoredValues()[0];
4453 if (isAlreadyNarrow(Member0) &&
4454 all_of(InterleaveR->getStoredValues(),
4455 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
4456 StoreGroups.push_back(InterleaveR);
4457 continue;
4458 }
4459
4460 // For now, we only support full interleave groups storing load interleave
4461 // groups.
4462 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
4463 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
4464 if (!DefR)
4465 return false;
4466 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
4467 return IR && IR->getInterleaveGroup()->isFull() &&
4468 IR->getVPValue(Op.index()) == Op.value();
4469 })) {
4470 StoreGroups.push_back(InterleaveR);
4471 continue;
4472 }
4473
4474     // Check if all values feeding InterleaveR are matching wide recipes whose
4475     // operands can be narrowed.
4476 auto *WideMember0 =
4477 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
4478 if (!WideMember0)
4479 return;
4480 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
4482 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
4483 R->getNumOperands() > 2)
4484 return;
4485 if (any_of(enumerate(R->operands()),
4486 [WideMember0, Idx = I](const auto &P) {
4487 const auto &[OpIdx, OpV] = P;
4488 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
4489 }))
4490 return;
4491 }
4492 StoreGroups.push_back(InterleaveR);
4493 }
4494
4495 if (StoreGroups.empty())
4496 return;
4497
4498 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
4499 SmallPtrSet<VPValue *, 4> NarrowedOps;
4500 // Narrow operation tree rooted at store groups.
4501 for (auto *StoreGroup : StoreGroups) {
4502 VPValue *Res =
4503 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
4504 auto *SI =
4505 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
4506 auto *S = new VPWidenStoreRecipe(
4507 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
4508 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
4509 S->insertBefore(StoreGroup);
4510 StoreGroup->eraseFromParent();
4511 }
4512
4513 // Adjust induction to reflect that the transformed plan only processes one
4514 // original iteration.
4515 auto *CanIV = VectorLoop->getCanonicalIV();
4516 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
4517 VPBuilder PHBuilder(Plan.getVectorPreheader());
4518
4519 VPValue *UF = Plan.getOrAddLiveIn(
4520 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
4521 if (VF.isScalable()) {
4522 VPValue *VScale = PHBuilder.createElementCount(
4524 VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
4525 Inc->setOperand(1, VScaleUF);
4526 Plan.getVF().replaceAllUsesWith(VScale);
4527 } else {
4528 Inc->setOperand(1, UF);
4530 Plan.getConstantInt(CanIV->getScalarType(), 1));
4531 }
4532 removeDeadRecipes(Plan);
4533}
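// Illustrative example (not from the original source): with VF = 4, i32
// elements and a 128-bit vector register, a full factor-4 load group feeding a
// full factor-4 store group is narrowed to a single wide load and wide store,
// and the canonical IV is stepped by UF (times vscale for scalable VFs)
// instead of VF * UF, so each unrolled part now executes exactly one original
// loop iteration.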
4534
4535/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
4536/// BranchOnCond recipe.
4538 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
4539 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
4540 auto *MiddleTerm =
4542 // Only add branch metadata if there is a (conditional) terminator.
4543 if (!MiddleTerm)
4544 return;
4545
4546 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
4547 "must have a BranchOnCond");
4548 // Assume that `TripCount % VectorStep ` is equally distributed.
4549 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
4550 if (VF.isScalable() && VScaleForTuning.has_value())
4551 VectorStep *= *VScaleForTuning;
4552 assert(VectorStep > 0 && "trip count should not be zero");
4553 MDBuilder MDB(Plan.getContext());
4554 MDNode *BranchWeights =
4555 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
4556 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
4557}
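// Worked example (illustrative, not from the original source): with VF = 4 and
// UF = 2 the vector step is 8, so the middle block's conditional branch gets
// branch_weights {1, 7}, modelling that the remainder TripCount % 8 is equally
// likely to take any value in [0, 8). With a scalable VF and VScaleForTuning
// = 2, the step doubles to 16 and the weights become {1, 15}.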
4558
4559/// Compute and return the end value for \p WideIV, unless it is truncated. If
4560/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
4561/// compute the end value of the induction.
4563 VPBuilder &VectorPHBuilder,
4564 VPTypeAnalysis &TypeInfo,
4565 VPValue *VectorTC) {
4566 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
4567 // Truncated wide inductions resume from the last lane of their vector value
4568 // in the last vector iteration which is handled elsewhere.
4569 if (WideIntOrFp && WideIntOrFp->getTruncInst())
4570 return nullptr;
4571
4572 VPValue *Start = WideIV->getStartValue();
4573 VPValue *Step = WideIV->getStepValue();
4575 VPValue *EndValue = VectorTC;
4576 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
4577 EndValue = VectorPHBuilder.createDerivedIV(
4578 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
4579 Start, VectorTC, Step);
4580 }
4581
4582 // EndValue is derived from the vector trip count (which has the same type as
4583 // the widest induction) and thus may be wider than the induction here.
4584 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
4585 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
4586 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
4587 ScalarTypeOfWideIV,
4588 WideIV->getDebugLoc());
4589 }
4590
4591 return EndValue;
4592}
4593
4595 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
4596 VPTypeAnalysis TypeInfo(Plan);
4597 auto *ScalarPH = Plan.getScalarPreheader();
4598 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
4599 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4600 VPBuilder VectorPHBuilder(
4601 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
4602 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4603 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
4604 auto *ResumePhiR = cast<VPPhi>(&PhiR);
4605
4606 // TODO: Extract final value from induction recipe initially, optimize to
4607 // pre-computed end value together in optimizeInductionExitUsers.
4608 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
4609 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
4611 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
4612 IVEndValues[WideIVR] = EndValue;
4613 ResumePhiR->setOperand(0, EndValue);
4614 ResumePhiR->setName("bc.resume.val");
4615 continue;
4616 }
4617 // TODO: Also handle truncated inductions here. Computing end-values
4618 // separately should be done as VPlan-to-VPlan optimization, after
4619 // legalizing all resume values to use the last lane from the loop.
4620 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
4621 "should only skip truncated wide inductions");
4622 continue;
4623 }
4624
4625 // The backedge value provides the value to resume coming out of a loop,
4626 // which for FORs is a vector whose last element needs to be extracted. The
4627 // start value provides the value if the loop is bypassed.
4628 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
4629 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
4630 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4631 "Cannot handle loops with uncountable early exits");
4632 if (IsFOR)
4633 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
4634 VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
4635 "vector.recur.extract");
4636 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
4637 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
4638 }
4639}
4640
4642 VFRange &Range) {
4643 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
4644 auto *ScalarPHVPBB = Plan.getScalarPreheader();
4645 auto *MiddleVPBB = Plan.getMiddleBlock();
4646 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
4647 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
4648
4649 auto IsScalableOne = [](ElementCount VF) -> bool {
4650 return VF == ElementCount::getScalable(1);
4651 };
4652
4653 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
4654 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
4655 if (!FOR)
4656 continue;
4657
4658 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
4659 "Cannot handle loops with uncountable early exits");
4660
4661 // This is the second phase of vectorizing first-order recurrences, creating
4662 // extract for users outside the loop. An overview of the transformation is
4663 // described below. Suppose we have the following loop with some use after
4664 // the loop of the last a[i-1],
4665 //
4666 // for (int i = 0; i < n; ++i) {
4667 // t = a[i - 1];
4668 // b[i] = a[i] - t;
4669 // }
4670 // use t;
4671 //
4672 // There is a first-order recurrence on "a". For this loop, the shorthand
4673 // scalar IR looks like:
4674 //
4675 // scalar.ph:
4676 // s.init = a[-1]
4677 // br scalar.body
4678 //
4679 // scalar.body:
4680 // i = phi [0, scalar.ph], [i+1, scalar.body]
4681 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
4682 // s2 = a[i]
4683 // b[i] = s2 - s1
4684 // br cond, scalar.body, exit.block
4685 //
4686 // exit.block:
4687 // use = lcssa.phi [s1, scalar.body]
4688 //
4689 // In this example, s1 is a recurrence because it's value depends on the
4690 // previous iteration. In the first phase of vectorization, we created a
4691 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
4692 // for users in the scalar preheader and exit block.
4693 //
4694 // vector.ph:
4695 // v_init = vector(..., ..., ..., a[-1])
4696 // br vector.body
4697 //
4698 // vector.body
4699 // i = phi [0, vector.ph], [i+4, vector.body]
4700 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4701 // v2 = a[i, i+1, i+2, i+3]
4702 // b[i] = v2 - v1
4703 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
4704 // b[i, i+1, i+2, i+3] = v2 - v1
4705 // br cond, vector.body, middle.block
4706 //
4707 // middle.block:
4708 // vector.recur.extract.for.phi = v2(2)
4709 // vector.recur.extract = v2(3)
4710 // br cond, scalar.ph, exit.block
4711 //
4712 // scalar.ph:
4713 // scalar.recur.init = phi [vector.recur.extract, middle.block],
4714 // [s.init, otherwise]
4715 // br scalar.body
4716 //
4717 // scalar.body:
4718 // i = phi [0, scalar.ph], [i+1, scalar.body]
4719 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
4720 // s2 = a[i]
4721 // b[i] = s2 - s1
4722 // br cond, scalar.body, exit.block
4723 //
4724 // exit.block:
4725 // lo = lcssa.phi [s1, scalar.body],
4726 // [vector.recur.extract.for.phi, middle.block]
4727 //
4728 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
4729 // Extract the penultimate value of the recurrence and use it as operand for
4730 // the VPIRInstruction modeling the phi.
4731 for (VPUser *U : FOR->users()) {
4732 using namespace llvm::VPlanPatternMatch;
4733 if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
4734 continue;
4735 // For VF vscale x 1, if vscale = 1, we are unable to extract the
4736 // penultimate value of the recurrence. Instead we rely on the existing
4737 // extract of the last element from the result of
4738 // VPInstruction::FirstOrderRecurrenceSplice.
4739 // TODO: Consider vscale_range info and UF.
4740 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
4741 Range))
4742 return;
4743 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
4744 VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
4745 {}, "vector.recur.extract.for.phi");
4746 cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
4747 }
4748 }
4749}
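
Editor's note (illustrative sketch, not part of VPlanTransforms.cpp or the LLVM sources): for the fixed-VF example in the comment above (VF = 4), the ExtractLastElement and ExtractPenultimateElement instructions created in the middle block conceptually read lanes VF-1 and VF-2 of the recurrence vector v2. A minimal IRBuilder sketch of IR with that shape, assuming a hypothetical helper name, an already-positioned builder, and a fixed-width vector value V2, looks roughly like this:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emit the two extracts from the VF=4 example in the comment: v2(3) feeds the
// scalar resume phi (scalar.recur.init), v2(2) feeds LCSSA phis in the exit
// block. The value names mirror the ones set by the transforms above.
static void emitMiddleBlockExtractsSketch(IRBuilder<> &Builder, Value *V2) {
  Value *Last =
      Builder.CreateExtractElement(V2, uint64_t(3), "vector.recur.extract");
  Value *Penultimate = Builder.CreateExtractElement(
      V2, uint64_t(2), "vector.recur.extract.for.phi");
  (void)Last;
  (void)Penultimate;
}

For scalable VFs the penultimate lane is not a compile-time constant, which is why the transform above returns early for VF = vscale x 1 and relies on the existing extract of the last element of the FirstOrderRecurrenceSplice result instead.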
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, ScalarEvolution &SE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static void recursivelyDeleteDeadRecipes(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, ScalarEvolution &SE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, ScalarEvolution &SE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:163
static DebugLoc getUnknown()
Definition DebugLoc.h:162
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:103
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:58
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:292
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3625
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3986
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4061
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4013
iterator end()
Definition VPlan.h:4023
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4021
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4074
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
const VPRecipeBase & back() const
Definition VPlan.h:4035
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2483
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2517
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2507
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2523
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2503
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:161
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:214
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:263
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:151
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:189
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3036
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3567
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:432
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:405
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:417
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3736
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3657
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3081
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2058
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2098
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2087
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4139
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4163
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1031
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1115
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1066
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1061
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1058
@ CanonicalIVIncrementForPart
Definition VPlan.h:1051
void setName(StringRef NewName)
Set the symbolic name for the VPInstruction.
Definition VPlan.h:1229
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2625
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2617
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2646
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2699
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2657
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3223
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPRegionBlock * getRegion()
Definition VPlan.h:4291
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2910
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition VPlan.h:2749
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4174
const VPBlockBase * getEntry() const
Definition VPlan.h:4210
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4285
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4242
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4227
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4272
const VPBlockBase * getExiting() const
Definition VPlan.h:4222
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4235
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2955
bool isSingleScalar() const
Definition VPlan.h:2996
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3020
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3806
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:530
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:193
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1920
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3699
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1548
Instruction::CastOps getOpcode() const
Definition VPlan.h:1584
A recipe for handling GEP instructions.
Definition VPlan.h:1844
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2122
PHINode * getPHINode() const
Definition VPlan.h:2164
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2150
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2167
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2198
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2274
A recipe for widening vector intrinsics.
Definition VPlan.h:1598
A common base class for widening memory operations.
Definition VPlan.h:3266
A recipe for widened phis.
Definition VPlan.h:2332
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1500
VPlan models a candidate for vectorization, encoding various decisions take to produce efficient outp...
Definition VPlan.h:4304
bool hasVF(ElementCount VF) const
Definition VPlan.h:4509
LLVMContext & getContext() const
Definition VPlan.h:4497
VPBasicBlock * getEntry()
Definition VPlan.h:4397
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4488
bool hasScalableVF() const
Definition VPlan.h:4510
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4495
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4491
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4459
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4566
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4480
unsigned getUF() const
Definition VPlan.h:4529
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4643
bool hasUF(unsigned UF) const
Definition VPlan.h:4527
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4449
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4572
void setVF(ElementCount VF)
Definition VPlan.h:4503
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4542
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4473
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4422
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4621
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4569
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4551
bool hasScalarVFOnly() const
Definition VPlan.h:4520
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4440
ArrayRef< VPValue * > getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4591
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4445
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4588
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4402
void setUF(unsigned UF)
Definition VPlan.h:4534
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4675
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastLanePerPart, Op0_t > m_ExtractLastLanePerPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector is matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2108
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2088
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2374
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3398
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3357
A recipe for widening select instructions.
Definition VPlan.h:1797
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3482
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3439
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
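A rough sketch of the invocation pattern, assuming the code lives alongside VPlanTransforms.cpp in the Vectorize directory; the choice of transforms to chain and the helper name cleanUp are illustrative, and a real caller already owns a constructed VPlan:
#include "VPlanTransforms.h"
// Run a couple of argument-free transforms through the runPass helper.
static void cleanUp(llvm::VPlan &Plan) {
  llvm::VPlanTransforms::runPass(llvm::VPlanTransforms::removeDeadRecipes, Plan);
  llvm::VPlanTransforms::runPass(llvm::VPlanTransforms::simplifyRecipes, Plan);
}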
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...