VPlanTransforms.cpp
1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
50 VPlan &Plan, const TargetLibraryInfo &TLI) {
51
53 Plan.getVectorLoopRegion());
55 // Skip blocks outside region
56 if (!VPBB->getParent())
57 break;
58 VPRecipeBase *Term = VPBB->getTerminator();
59 auto EndIter = Term ? Term->getIterator() : VPBB->end();
60 // Introduce each ingredient into VPlan.
61 for (VPRecipeBase &Ingredient :
62 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
63
64 VPValue *VPV = Ingredient.getVPSingleValue();
65 if (!VPV->getUnderlyingValue())
66 continue;
67
68 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
 69
70 VPRecipeBase *NewRecipe = nullptr;
71 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
72 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
73 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
74 for (VPValue *Op : PhiR->operands())
75 NewRecipe->addOperand(Op);
76 } else if (auto *VPI = dyn_cast<VPInstruction>(&Ingredient)) {
77 assert(!isa<PHINode>(Inst) && "phis should be handled above");
78 // Create VPWidenMemoryRecipe for loads and stores.
79 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
80 NewRecipe = new VPWidenLoadRecipe(
81 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
82 false /*Consecutive*/, false /*Reverse*/, *VPI,
83 Ingredient.getDebugLoc());
84 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
85 NewRecipe = new VPWidenStoreRecipe(
86 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
87 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
88 Ingredient.getDebugLoc());
89 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
90 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
91 Ingredient.getDebugLoc());
92 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
93 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
94 if (VectorID == Intrinsic::not_intrinsic)
95 return false;
96 NewRecipe = new VPWidenIntrinsicRecipe(
97 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
98 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
99 *VPI, CI->getDebugLoc());
100 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
101 NewRecipe = new VPWidenCastRecipe(
102 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
103 VPIRFlags(*CI), VPIRMetadata(*CI));
104 } else {
105 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
106 *VPI, Ingredient.getDebugLoc());
107 }
108 } else {
110 "inductions must be created earlier");
111 continue;
112 }
113
114 NewRecipe->insertBefore(&Ingredient);
115 if (NewRecipe->getNumDefinedValues() == 1)
116 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
117 else
118 assert(NewRecipe->getNumDefinedValues() == 0 &&
119 "Only recpies with zero or one defined values expected");
120 Ingredient.eraseFromParent();
121 }
122 }
123 return true;
124}
125
126/// Helper for extra no-alias checks via known-safe recipe and SCEV.
128 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
129 VPReplicateRecipe &GroupLeader;
130 PredicatedScalarEvolution &PSE;
131 const Loop &L;
132 VPTypeAnalysis &TypeInfo;
133
134 // Return true if \p A and \p B are known not to alias for all VFs in the
135 // plan, proven via the constant distance between the two accesses.
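 // E.g., with a maximum VF of 4 and two 4-byte stores, a constant distance
 // of at least 4 * 4 = 16 bytes between the two addresses guarantees that the
 // accesses of a single vector iteration cannot overlap.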
136 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
137 if (A->getOpcode() != Instruction::Store ||
138 B->getOpcode() != Instruction::Store)
139 return false;
140
141 VPValue *AddrA = A->getOperand(1);
142 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
143 VPValue *AddrB = B->getOperand(1);
144 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
145 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
146 return false;
147
148 const APInt *Distance;
149 ScalarEvolution &SE = *PSE.getSE();
150 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
151 return false;
152
153 const DataLayout &DL = SE.getDataLayout();
154 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
155 uint64_t SizeA = DL.getTypeStoreSize(TyA);
156 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
157 uint64_t SizeB = DL.getTypeStoreSize(TyB);
158
159 // Use the maximum store size to ensure no overlap from either direction.
160 // Currently only fixed store sizes are handled, as this is only used for
161 // VPReplicateRecipes, which are replicated per lane.
162 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
163
164 auto VFs = B->getParent()->getPlan()->vectorFactors();
166 if (MaxVF.isScalable())
167 return false;
168 return Distance->abs().uge(
169 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
170 }
171
172public:
175 const Loop &L, VPTypeAnalysis &TypeInfo)
176 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
177 L(L), TypeInfo(TypeInfo) {}
178
179 /// Return true if \p R should be skipped during alias checking, either
180 /// because it's in the exclude set or because no-alias can be proven via
181 /// SCEV.
182 bool shouldSkip(VPRecipeBase &R) const {
183 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
184 return ExcludeRecipes.contains(&R) ||
185 (Store && isNoAliasViaDistance(Store, &GroupLeader));
186 }
187};
188
189/// Check if a memory operation doesn't alias with memory operations in blocks
190/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
191/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
192/// checked (for load hoisting). Otherwise recipes that may read or write
193/// memory are checked, and SCEV is used to prove no-alias between the group
194/// leader and other replicate recipes (for store sinking).
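/// The blocks from \p FirstBB to \p LastBB are expected to form a chain of
/// single successors; the walk conservatively returns false for any recipe
/// without a memory location or whose scoped-alias metadata cannot prove
/// no-alias with \p MemLoc.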
195static bool
197 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
198 std::optional<SinkStoreInfo> SinkInfo = {}) {
199 bool CheckReads = SinkInfo.has_value();
200 if (!MemLoc.AATags.Scope)
201 return false;
202
203 const AAMDNodes &MemAA = MemLoc.AATags;
204
205 for (VPBlockBase *Block = FirstBB; Block;
206 Block = Block->getSingleSuccessor()) {
207 assert(Block->getNumSuccessors() <= 1 &&
208 "Expected at most one successor in block chain");
209 auto *VPBB = cast<VPBasicBlock>(Block);
210 for (VPRecipeBase &R : *VPBB) {
211 if (SinkInfo && SinkInfo->shouldSkip(R))
212 continue;
213
214 // Skip recipes that don't need checking.
215 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
216 continue;
217
219 if (!Loc)
220 // Conservatively assume aliasing for memory operations without
221 // location.
222 return false;
223
224 // For reads, check if they don't alias in the reverse direction and
225 // skip if so.
226 if (CheckReads && R.mayReadFromMemory() &&
228 MemAA.NoAlias))
229 continue;
230
231 // Check if the memory operations may alias in the forward direction.
233 Loc->AATags.NoAlias))
234 return false;
235 }
236
237 if (Block == LastBB)
238 break;
239 }
240 return true;
241}
242
243/// Return true if we do not know how to (mechanically) hoist or sink \p R out
244/// of a loop region.
246 // Assume intrinsics don't alias anything or throw; as long as they're
247 // guaranteed to execute, they're safe to hoist.
249 return false;
250
251 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
252 // memory location is not modified in the vector loop.
253 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
254 return true;
255
256 // Allocas cannot be hoisted.
257 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
258 return RepR && RepR->getOpcode() == Instruction::Alloca;
259}
260
261static bool sinkScalarOperands(VPlan &Plan) {
262 auto Iter = vp_depth_first_deep(Plan.getEntry());
263 bool ScalarVFOnly = Plan.hasScalarVFOnly();
264 bool Changed = false;
265
267 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
268 VPBasicBlock *SinkTo, VPValue *Op) {
269 auto *Candidate =
270 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
271 if (!Candidate)
272 return;
273
274 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
275 // for now.
277 return;
278
279 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
280 return;
281
282 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
283 if (!ScalarVFOnly && RepR->isSingleScalar())
284 return;
285
286 WorkList.insert({SinkTo, Candidate});
287 };
288
289 // First, collect the operands of all recipes in replicate blocks as seeds for
290 // sinking.
292 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
293 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
294 continue;
295 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
296 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
297 continue;
298 for (auto &Recipe : *VPBB)
299 for (VPValue *Op : Recipe.operands())
300 InsertIfValidSinkCandidate(VPBB, Op);
301 }
302
303 // Try to sink each replicate or scalar IV steps recipe in the worklist.
304 for (unsigned I = 0; I != WorkList.size(); ++I) {
305 VPBasicBlock *SinkTo;
306 VPSingleDefRecipe *SinkCandidate;
307 std::tie(SinkTo, SinkCandidate) = WorkList[I];
308
309 // All recipe users of SinkCandidate must be in the same block SinkTo or all
310 // users outside of SinkTo must only use the first lane of SinkCandidate. In
311 // the latter case, we need to duplicate SinkCandidate.
312 auto UsersOutsideSinkTo =
313 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
314 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
315 });
316 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
317 return !U->usesFirstLaneOnly(SinkCandidate);
318 }))
319 continue;
320 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
321
322 if (NeedsDuplicating) {
323 if (ScalarVFOnly)
324 continue;
325 VPSingleDefRecipe *Clone;
326 if (auto *SinkCandidateRepR =
327 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
328 // TODO: Handle converting to uniform recipes as separate transform,
329 // then cloning should be sufficient here.
330 Instruction *I = SinkCandidate->getUnderlyingInstr();
331 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
332 nullptr /*Mask*/, *SinkCandidateRepR,
333 *SinkCandidateRepR);
334 // TODO: add ".cloned" suffix to name of Clone's VPValue.
335 } else {
336 Clone = SinkCandidate->clone();
337 }
338
339 Clone->insertBefore(SinkCandidate);
340 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
341 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
342 });
343 }
344 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
345 for (VPValue *Op : SinkCandidate->operands())
346 InsertIfValidSinkCandidate(SinkTo, Op);
347 Changed = true;
348 }
349 return Changed;
350}
351
352/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
353/// the mask.
355 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
356 if (!EntryBB || EntryBB->size() != 1 ||
357 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
358 return nullptr;
359
360 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
361}
362
363/// If \p R is a triangle region, return the 'then' block of the triangle.
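/// A triangle region has the shape
///
///     Entry
///     |    \
///     |    Then
///     |    /
///     Merge
///
/// i.e. exactly one successor of the entry block ('Then') has a single
/// successor, and that successor is the entry's other successor.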
365 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
366 if (EntryBB->getNumSuccessors() != 2)
367 return nullptr;
368
369 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
370 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
371 if (!Succ0 || !Succ1)
372 return nullptr;
373
374 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
375 return nullptr;
376 if (Succ0->getSingleSuccessor() == Succ1)
377 return Succ0;
378 if (Succ1->getSingleSuccessor() == Succ0)
379 return Succ1;
380 return nullptr;
381}
382
383// Merge replicate regions in their successor region, if a replicate region
384// is connected to a successor replicate region with the same predicate by a
385// single, empty VPBasicBlock.
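// That is, rewrite
//
//   [region R1, mask M] -> [empty VPBB] -> [region R2, mask M] -> ...
//
// into a single replicate region by moving R1's recipes into R2 and
// disconnecting R1.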
387 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
388
389 // Collect replicate regions followed by an empty block, followed by another
390 // replicate region with a matching mask, to process up front. This avoids
391 // iterator invalidation issues while merging regions.
394 vp_depth_first_deep(Plan.getEntry()))) {
395 if (!Region1->isReplicator())
396 continue;
397 auto *MiddleBasicBlock =
398 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
399 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
400 continue;
401
402 auto *Region2 =
403 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
404 if (!Region2 || !Region2->isReplicator())
405 continue;
406
407 VPValue *Mask1 = getPredicatedMask(Region1);
408 VPValue *Mask2 = getPredicatedMask(Region2);
409 if (!Mask1 || Mask1 != Mask2)
410 continue;
411
412 assert(Mask1 && Mask2 && "both regions must have conditions");
413 WorkList.push_back(Region1);
414 }
415
416 // Move recipes from Region1 to its successor region, if both are triangles.
417 for (VPRegionBlock *Region1 : WorkList) {
418 if (TransformedRegions.contains(Region1))
419 continue;
420 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
421 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
422
423 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
424 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
425 if (!Then1 || !Then2)
426 continue;
427
428 // Note: No fusion-preventing memory dependencies are expected in either
429 // region. Such dependencies should be rejected during earlier dependence
430 // checks, which guarantee accesses can be re-ordered for vectorization.
431 //
432 // Move recipes to the successor region.
433 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
434 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
435
436 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
437 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
438
439 // Move VPPredInstPHIRecipes from the merge block to the successor region's
440 // merge block. Update all users inside the successor region to use the
441 // original values.
442 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
443 VPValue *PredInst1 =
444 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
445 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
446 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
447 return cast<VPRecipeBase>(&U)->getParent() == Then2;
448 });
449
450 // Remove phi recipes that are unused after merging the regions.
451 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
452 Phi1ToMove.eraseFromParent();
453 continue;
454 }
455 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
456 }
457
458 // Remove the dead recipes in Region1's entry block.
459 for (VPRecipeBase &R :
460 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
461 R.eraseFromParent();
462
463 // Finally, remove the first region.
464 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
465 VPBlockUtils::disconnectBlocks(Pred, Region1);
466 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
467 }
468 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
469 TransformedRegions.insert(Region1);
470 }
471
472 return !TransformedRegions.empty();
473}
474
476 VPlan &Plan) {
477 Instruction *Instr = PredRecipe->getUnderlyingInstr();
478 // Build the triangular if-then region.
479 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
480 assert(Instr->getParent() && "Predicated instruction not in any basic block");
481 auto *BlockInMask = PredRecipe->getMask();
482 auto *MaskDef = BlockInMask->getDefiningRecipe();
483 auto *BOMRecipe = new VPBranchOnMaskRecipe(
484 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
485 auto *Entry =
486 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
487
488 // Replace predicated replicate recipe with a replicate recipe without a
489 // mask but in the replicate region.
490 auto *RecipeWithoutMask = new VPReplicateRecipe(
491 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
492 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
493 PredRecipe->getDebugLoc());
494 auto *Pred =
495 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
496
497 VPPredInstPHIRecipe *PHIRecipe = nullptr;
498 if (PredRecipe->getNumUsers() != 0) {
499 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
500 RecipeWithoutMask->getDebugLoc());
501 PredRecipe->replaceAllUsesWith(PHIRecipe);
502 PHIRecipe->setOperand(0, RecipeWithoutMask);
503 }
504 PredRecipe->eraseFromParent();
505 auto *Exiting =
506 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
508 Plan.createReplicateRegion(Entry, Exiting, RegionName);
509
510 // Note: first set Entry as region entry and then connect successors starting
511 // from it in order, to propagate the "parent" of each VPBasicBlock.
512 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
513 VPBlockUtils::connectBlocks(Pred, Exiting);
514
515 return Region;
516}
517
518static void addReplicateRegions(VPlan &Plan) {
521 vp_depth_first_deep(Plan.getEntry()))) {
522 for (VPRecipeBase &R : *VPBB)
523 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
524 if (RepR->isPredicated())
525 WorkList.push_back(RepR);
526 }
527 }
528
529 unsigned BBNum = 0;
530 for (VPReplicateRecipe *RepR : WorkList) {
531 VPBasicBlock *CurrentBlock = RepR->getParent();
532 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
533
534 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
535 SplitBlock->setName(
536 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
537 // Record predicated instructions for above packing optimizations.
539 Region->setParent(CurrentBlock->getParent());
541
542 VPRegionBlock *ParentRegion = Region->getParent();
543 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
544 ParentRegion->setExiting(SplitBlock);
545 }
546}
547
548/// Remove redundant VPBasicBlocks by merging them into their predecessor if
549/// the predecessor has a single successor.
553 vp_depth_first_deep(Plan.getEntry()))) {
554 // Don't fold the blocks in the skeleton of the Plan into their single
555 // predecessors for now.
556 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
557 if (!VPBB->getParent())
558 continue;
559 auto *PredVPBB =
560 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
561 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
562 isa<VPIRBasicBlock>(PredVPBB))
563 continue;
564 WorkList.push_back(VPBB);
565 }
566
567 for (VPBasicBlock *VPBB : WorkList) {
568 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
569 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
570 R.moveBefore(*PredVPBB, PredVPBB->end());
571 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
572 auto *ParentRegion = VPBB->getParent();
573 if (ParentRegion && ParentRegion->getExiting() == VPBB)
574 ParentRegion->setExiting(PredVPBB);
575 for (auto *Succ : to_vector(VPBB->successors())) {
577 VPBlockUtils::connectBlocks(PredVPBB, Succ);
578 }
579 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
580 }
581 return !WorkList.empty();
582}
583
585 // Convert masked VPReplicateRecipes to if-then region blocks.
587
588 bool ShouldSimplify = true;
589 while (ShouldSimplify) {
590 ShouldSimplify = sinkScalarOperands(Plan);
591 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
592 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
593 }
594}
595
596/// Remove redundant casts of inductions.
597///
598/// Such redundant casts are casts of induction variables that can be ignored,
599/// because we already proved that the casted phi is equal to the uncasted phi
600/// in the vectorized loop. There is no need to vectorize the cast - the same
601/// value can be used for both the phi and casts in the vector loop.
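/// For example, if the induction descriptor recorded the cast chain
/// %iv -> %c1 = trunc %iv -> %c2 = sext %c1, users of %c2 are redirected to
/// the widened IV itself; the now-dead cast recipes are cleaned up later.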
603 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
605 if (!IV || IV->getTruncInst())
606 continue;
607
608 // A sequence of IR Casts has potentially been recorded for IV, which
609 // *must be bypassed* when the IV is vectorized, because the vectorized IV
610 // will produce the desired casted value. This sequence forms a def-use
611 // chain and is provided in reverse order, ending with the cast that uses
612 // the IV phi. Search for the recipe of the last cast in the chain and
613 // replace it with the original IV. Note that only the final cast is
614 // expected to have users outside the cast-chain and the dead casts left
615 // over will be cleaned up later.
616 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
617 VPValue *FindMyCast = IV;
618 for (Instruction *IRCast : reverse(Casts)) {
619 VPSingleDefRecipe *FoundUserCast = nullptr;
620 for (auto *U : FindMyCast->users()) {
621 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
622 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
623 FoundUserCast = UserCast;
624 break;
625 }
626 }
627 FindMyCast = FoundUserCast;
628 }
629 FindMyCast->replaceAllUsesWith(IV);
630 }
631}
632
633/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
634/// recipe, if it exists.
636 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
637 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
638 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
639 for (VPUser *U : CanonicalIV->users()) {
641 if (WidenNewIV)
642 break;
643 }
644
645 if (!WidenNewIV)
646 return;
647
648 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
649 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
650 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
651
652 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
653 continue;
654
655 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
656 // everything WidenNewIV's users need. That is, WidenOriginalIV will
657 // generate a vector phi or all users of WidenNewIV demand the first lane
658 // only.
659 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
660 vputils::onlyFirstLaneUsed(WidenNewIV)) {
661 // We are replacing a wide canonical IV with a suitable wide induction.
662 // The result is used to compute the header mask, hence all lanes will be
663 // used and we need to drop wrap flags that only apply to lanes guaranteed
664 // to execute in the original scalar loop.
665 WidenOriginalIV->dropPoisonGeneratingFlags();
666 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
667 WidenNewIV->eraseFromParent();
668 return;
669 }
670 }
671}
672
673/// Returns true if \p R is dead and can be removed.
674static bool isDeadRecipe(VPRecipeBase &R) {
675 // Do remove conditional assume instructions as their conditions may be
676 // flattened.
677 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
678 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
680 if (IsConditionalAssume)
681 return true;
682
683 if (R.mayHaveSideEffects())
684 return false;
685
686 // Recipe is dead if no user keeps the recipe alive.
687 return all_of(R.definedValues(),
688 [](VPValue *V) { return V->getNumUsers() == 0; });
689}
690
693 vp_post_order_deep(Plan.getEntry()))) {
694 // The recipes in the block are processed in reverse order, to catch chains
695 // of dead recipes.
696 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
697 if (isDeadRecipe(R)) {
698 R.eraseFromParent();
699 continue;
700 }
701
702 // Check if R is a dead VPPhi <-> update cycle and remove it.
703 auto *PhiR = dyn_cast<VPPhi>(&R);
704 if (!PhiR || PhiR->getNumOperands() != 2)
705 continue;
706 VPUser *PhiUser = PhiR->getSingleUser();
707 if (!PhiUser)
708 continue;
709 VPValue *Incoming = PhiR->getOperand(1);
710 if (PhiUser != Incoming->getDefiningRecipe() ||
711 Incoming->getNumUsers() != 1)
712 continue;
713 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
714 PhiR->eraseFromParent();
715 Incoming->getDefiningRecipe()->eraseFromParent();
716 }
717 }
718}
719
722 Instruction::BinaryOps InductionOpcode,
723 FPMathOperator *FPBinOp, Instruction *TruncI,
724 VPIRValue *StartV, VPValue *Step, DebugLoc DL,
725 VPBuilder &Builder) {
726 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
727 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
728 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
729 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
730 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
731
732 // Truncate base induction if needed.
733 VPTypeAnalysis TypeInfo(Plan);
734 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
735 if (TruncI) {
736 Type *TruncTy = TruncI->getType();
737 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
738 "Not truncating.");
739 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
740 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
741 ResultTy = TruncTy;
742 }
743
744 // Truncate step if needed.
745 Type *StepTy = TypeInfo.inferScalarType(Step);
746 if (ResultTy != StepTy) {
747 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
748 "Not truncating.");
749 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
750 auto *VecPreheader =
752 VPBuilder::InsertPointGuard Guard(Builder);
753 Builder.setInsertPoint(VecPreheader);
754 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
755 }
756 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
757 &Plan.getVF(), DL);
758}
759
762 for (unsigned I = 0; I != Users.size(); ++I) {
764 if (isa<VPHeaderPHIRecipe>(Cur))
765 continue;
766 for (VPValue *V : Cur->definedValues())
767 Users.insert_range(V->users());
768 }
769 return Users.takeVector();
770}
771
772/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
773/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
774/// generates scalar values.
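/// Conceptually, a pointer IV %ptr = %start + <iteration> * %step becomes
///   %steps    = scalar-steps (0, %step)
///   %next.gep = ptradd %start, %steps
/// so only the scalar addresses that are actually needed are computed.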
775static VPValue *
777 VPlan &Plan, VPBuilder &Builder) {
779 VPIRValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
780 VPValue *StepV = PtrIV->getOperand(1);
782 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
783 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
784
785 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
786 PtrIV->getDebugLoc(), "next.gep");
787}
788
789/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
790/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
791/// VPWidenPointerInductionRecipe will only generate vectors. If some users
792/// require vectors while others require scalars, the scalar uses need to
793/// extract the scalars from the generated vectors (note that this is different
794/// from how int/fp inductions are handled). Legalize extract-from-ends that use
795/// uniform VPReplicateRecipes of wide inductions to use regular
796/// VPReplicateRecipes, so the correct end value is available. Also optimize
797/// VPWidenIntOrFpInductionRecipe: if any of its users needs scalar values,
798/// provide them with scalar steps built on the canonical scalar IV and update
799/// the original IV's users. This is an optional optimization to reduce the need
800/// for vector extracts.
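/// Conceptually, a widened IV whose scalar values are used gets a companion
///   %steps = scalar-iv-steps (built on the canonical IV, %start, %step)
/// and scalar users are redirected to %steps, while vector users keep using
/// the widened recipe.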
803 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
804 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
805 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
806 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
807 if (!PhiR)
808 continue;
809
810 // Try to narrow wide and replicating recipes to uniform recipes, based on
811 // VPlan analysis.
812 // TODO: Apply to all recipes in the future, to replace legacy uniformity
813 // analysis.
814 auto Users = collectUsersRecursively(PhiR);
815 for (VPUser *U : reverse(Users)) {
816 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
817 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
818 // Skip recipes that shouldn't be narrowed.
819 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
820 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
821 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
822 continue;
823
824 // Skip recipes where lanes other than the first may be used.
826 continue;
827
828 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
829 Def->operands(), /*IsUniform*/ true,
830 /*Mask*/ nullptr, /*Flags*/ *Def);
831 Clone->insertAfter(Def);
832 Def->replaceAllUsesWith(Clone);
833 }
834
835 // Replace wide pointer inductions which have only their scalars used by
836 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
837 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
838 if (!Plan.hasScalarVFOnly() &&
839 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
840 continue;
841
842 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
843 PtrIV->replaceAllUsesWith(PtrAdd);
844 continue;
845 }
846
847 // Replace widened induction with scalar steps for users that only use
848 // scalars.
849 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
850 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
851 return U->usesScalars(WideIV);
852 }))
853 continue;
854
855 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
857 Plan, ID.getKind(), ID.getInductionOpcode(),
858 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
859 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
860 WideIV->getDebugLoc(), Builder);
861
862 // Update scalar users of IV to use Step instead.
863 if (!HasOnlyVectorVFs) {
864 assert(!Plan.hasScalableVF() &&
865 "plans containing a scalar VF cannot also include scalable VFs");
866 WideIV->replaceAllUsesWith(Steps);
867 } else {
868 bool HasScalableVF = Plan.hasScalableVF();
869 WideIV->replaceUsesWithIf(Steps,
870 [WideIV, HasScalableVF](VPUser &U, unsigned) {
871 if (HasScalableVF)
872 return U.usesFirstLaneOnly(WideIV);
873 return U.usesScalars(WideIV);
874 });
875 }
876 }
877}
878
879/// Check if \p VPV is an untruncated wide induction, either before or after the
880/// increment. If so return the header IV (before the increment), otherwise
881/// return null.
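/// For example, for a wide IV %iv with step %step, both %iv itself and its
/// increment %iv.next = add %iv, %step are considered optimizable; in both
/// cases the returned value is the header phi %iv.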
884 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
885 if (WideIV) {
886 // VPV itself is a wide induction, separately compute the end value for exit
887 // users if it is not a truncated IV.
888 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
889 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
890 }
891
892 // Check if VPV is an optimizable induction increment.
893 VPRecipeBase *Def = VPV->getDefiningRecipe();
894 if (!Def || Def->getNumOperands() != 2)
895 return nullptr;
896 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
897 if (!WideIV)
898 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
899 if (!WideIV)
900 return nullptr;
901
902 auto IsWideIVInc = [&]() {
903 auto &ID = WideIV->getInductionDescriptor();
904
905 // Check if VPV increments the induction by the induction step.
906 VPValue *IVStep = WideIV->getStepValue();
907 switch (ID.getInductionOpcode()) {
908 case Instruction::Add:
909 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
910 case Instruction::FAdd:
912 m_Specific(IVStep)));
913 case Instruction::FSub:
914 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
915 m_Specific(IVStep)));
916 case Instruction::Sub: {
917 // IVStep will be the negated step of the subtraction. Check if Step == -1
918 // * IVStep.
919 VPValue *Step;
920 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
921 return false;
922 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
923 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
924 ScalarEvolution &SE = *PSE.getSE();
925 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
926 !isa<SCEVCouldNotCompute>(StepSCEV) &&
927 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
928 }
929 default:
930 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
931 match(VPV, m_GetElementPtr(m_Specific(WideIV),
932 m_Specific(WideIV->getStepValue())));
933 }
934 llvm_unreachable("should have been covered by switch above");
935 };
936 return IsWideIVInc() ? WideIV : nullptr;
937}
938
939/// Attempts to optimize the induction variable exit values for users in the
940/// early exit block.
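/// The exit value is recomputed from the canonical IV: the index of the first
/// active lane of the exit mask is added to the canonical IV (plus one if the
/// incremented IV escapes), and the result is fed through a derived IV when
/// the escaping induction is not the canonical one.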
942 VPTypeAnalysis &TypeInfo,
943 VPBlockBase *PredVPBB,
944 VPValue *Op,
946 VPValue *Incoming, *Mask;
949 return nullptr;
950
951 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
952 if (!WideIV)
953 return nullptr;
954
955 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
956 if (WideIntOrFp && WideIntOrFp->getTruncInst())
957 return nullptr;
958
959 // Calculate the final index.
960 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
961 auto *CanonicalIV = LoopRegion->getCanonicalIV();
962 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
963 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
964
965 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
966 VPValue *FirstActiveLane =
967 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
968 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
969 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
970 FirstActiveLaneType, DL);
971 VPValue *EndValue =
972 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
973
974 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
975 // changed it means the exit is using the incremented value; account for
976 // that by advancing the index by one, i.e. by one step of the IV.
977 if (Incoming != WideIV) {
978 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
979 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
980 }
981
982 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
983 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
984 VPIRValue *Start = WideIV->getStartValue();
985 VPValue *Step = WideIV->getStepValue();
986 EndValue = B.createDerivedIV(
987 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
988 Start, EndValue, Step);
989 }
990
991 return EndValue;
992}
993
994/// Attempts to optimize the induction variable exit values for users in the
995/// exit block coming from the latch in the original scalar loop.
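/// The end value of the IV has already been pre-computed for the middle
/// block; if the pre-incremented IV escapes, one step is subtracted from that
/// end value, using sub, ptradd or fsub/fadd for integer, pointer and
/// floating-point inductions respectively.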
997 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1001 return nullptr;
1002
1003 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1004 if (!WideIV)
1005 return nullptr;
1006
1007 VPValue *EndValue = EndValues.lookup(WideIV);
1008 assert(EndValue && "end value must have been pre-computed");
1009
1010 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1011 // changed it means the exit is using the incremented value, so we don't
1012 // need to subtract the step.
1013 if (Incoming != WideIV)
1014 return EndValue;
1015
1016 // Otherwise, subtract the step from the EndValue.
1017 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1018 VPValue *Step = WideIV->getStepValue();
1019 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1020 if (ScalarTy->isIntegerTy())
1021 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1022 DebugLoc::getUnknown(), "ind.escape");
1023 if (ScalarTy->isPointerTy()) {
1024 Type *StepTy = TypeInfo.inferScalarType(Step);
1025 auto *Zero = Plan.getConstantInt(StepTy, 0);
1026 return B.createPtrAdd(EndValue,
1027 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1028 DebugLoc::getUnknown(), "ind.escape");
1029 }
1030 if (ScalarTy->isFloatingPointTy()) {
1031 const auto &ID = WideIV->getInductionDescriptor();
1032 return B.createNaryOp(
1033 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1034 ? Instruction::FSub
1035 : Instruction::FAdd,
1036 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1037 }
1038 llvm_unreachable("all possible induction types must be handled");
1039 return nullptr;
1040}
1041
1043 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1045 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1046 VPTypeAnalysis TypeInfo(Plan);
1047 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1048 for (VPRecipeBase &R : ExitVPBB->phis()) {
1049 auto *ExitIRI = cast<VPIRPhi>(&R);
1050
1051 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1052 VPValue *Escape = nullptr;
1053 if (PredVPBB == MiddleVPBB)
1054 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1055 ExitIRI->getOperand(Idx),
1056 EndValues, PSE);
1057 else
1059 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1060 if (Escape)
1061 ExitIRI->setOperand(Idx, Escape);
1062 }
1063 }
1064 }
1065}
1066
1067/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
1068/// them with already existing recipes expanding the same SCEV expression.
1071
1072 for (VPRecipeBase &R :
1074 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1075 if (!ExpR)
1076 continue;
1077
1078 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1079 if (Inserted)
1080 continue;
1081 ExpR->replaceAllUsesWith(V->second);
1082 ExpR->eraseFromParent();
1083 }
1084}
1085
1087 SmallVector<VPValue *> WorkList;
1089 WorkList.push_back(V);
1090
1091 while (!WorkList.empty()) {
1092 VPValue *Cur = WorkList.pop_back_val();
1093 if (!Seen.insert(Cur).second)
1094 continue;
1095 VPRecipeBase *R = Cur->getDefiningRecipe();
1096 if (!R)
1097 continue;
1098 if (!isDeadRecipe(*R))
1099 continue;
1100 append_range(WorkList, R->operands());
1101 R->eraseFromParent();
1102 }
1103}
1104
1105/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1106/// Returns an optional pair, where the first element indicates whether it is
1107/// an intrinsic ID.
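/// E.g., a VPWidenIntrinsicRecipe for llvm.smax yields {true, Intrinsic::smax},
/// recipes wrapping an IR opcode yield {false, <opcode>}, and recipes without
/// a direct IR counterpart get a synthetic opcode past VPInstruction::OpsEnd.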
1108static std::optional<std::pair<bool, unsigned>>
1109getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1110 return TypeSwitch<const VPSingleDefRecipe *,
1111 std::optional<std::pair<bool, unsigned>>>(R)
1114 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1115 .Case([](const VPWidenIntrinsicRecipe *I) {
1116 return std::make_pair(true, I->getVectorIntrinsicID());
1117 })
1118 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1119 // For recipes that do not directly map to LLVM IR instructions,
1120 // assign opcodes after the last VPInstruction opcode (which is also
1121 // after the last IR Instruction opcode), based on the VPRecipeID.
1122 return std::make_pair(false,
1123 VPInstruction::OpsEnd + 1 + I->getVPRecipeID());
1124 })
1125 .Default([](auto *) { return std::nullopt; });
1126}
1127
1128/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1129/// non-nullptr VPValue for a handled opcode or intrinsic ID if the
1130/// corresponding \p Operands are foldable live-ins.
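/// For example, an add of the live-ins i32 2 and i32 3 folds to the live-in
/// constant i32 5, which is registered in the plan via getOrAddLiveIn.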
1132 ArrayRef<VPValue *> Operands,
1133 const DataLayout &DL,
1134 VPTypeAnalysis &TypeInfo) {
1135 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1136 if (!OpcodeOrIID)
1137 return nullptr;
1138
1140 for (VPValue *Op : Operands) {
1141 if (!match(Op, m_LiveIn()))
1142 return nullptr;
1143 Value *V = Op->getUnderlyingValue();
1144 if (!V)
1145 return nullptr;
1146 Ops.push_back(V);
1147 }
1148
1149 auto FoldToIRValue = [&]() -> Value * {
1150 InstSimplifyFolder Folder(DL);
1151 if (OpcodeOrIID->first) {
1152 if (R.getNumOperands() != 2)
1153 return nullptr;
1154 unsigned ID = OpcodeOrIID->second;
1155 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1156 TypeInfo.inferScalarType(&R));
1157 }
1158 unsigned Opcode = OpcodeOrIID->second;
1159 if (Instruction::isBinaryOp(Opcode))
1160 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1161 Ops[0], Ops[1]);
1162 if (Instruction::isCast(Opcode))
1163 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1164 TypeInfo.inferScalarType(R.getVPSingleValue()));
1165 switch (Opcode) {
1167 return Folder.FoldSelect(Ops[0], Ops[1],
1169 case VPInstruction::Not:
1170 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1172 case Instruction::Select:
1173 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1174 case Instruction::ICmp:
1175 case Instruction::FCmp:
1176 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1177 Ops[1]);
1178 case Instruction::GetElementPtr: {
1179 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1180 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1181 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1182 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1183 }
1186 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1187 Ops[0], Ops[1],
1188 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1189 // An extract of a live-in is an extract of a broadcast, so return the
1190 // broadcasted element.
1191 case Instruction::ExtractElement:
1192 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1193 return Ops[0];
1194 }
1195 return nullptr;
1196 };
1197
1198 if (Value *V = FoldToIRValue())
1199 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1200 return nullptr;
1201}
1202
1203/// Try to simplify VPSingleDefRecipe \p Def.
1205 VPlan *Plan = Def->getParent()->getPlan();
1206
1207 // Simplification of live-in IR values for SingleDef recipes using
1208 // InstSimplifyFolder.
1209 const DataLayout &DL =
1211 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1212 return Def->replaceAllUsesWith(V);
1213
1214 // Fold PredPHI LiveIn -> LiveIn.
1215 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1216 VPValue *Op = PredPHI->getOperand(0);
1217 if (isa<VPIRValue>(Op))
1218 PredPHI->replaceAllUsesWith(Op);
1219 }
1220
1221 VPBuilder Builder(Def);
1222 VPValue *A;
1223 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1224 Type *TruncTy = TypeInfo.inferScalarType(Def);
1225 Type *ATy = TypeInfo.inferScalarType(A);
1226 if (TruncTy == ATy) {
1227 Def->replaceAllUsesWith(A);
1228 } else {
1229 // Don't replace a scalarizing recipe with a widened cast.
1230 if (isa<VPReplicateRecipe>(Def))
1231 return;
1232 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1233
1234 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1235 ? Instruction::SExt
1236 : Instruction::ZExt;
1237 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1238 TruncTy);
1239 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1240 // UnderlyingExt has distinct return type, used to retain legacy cost.
1241 Ext->setUnderlyingValue(UnderlyingExt);
1242 }
1243 Def->replaceAllUsesWith(Ext);
1244 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1245 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1246 Def->replaceAllUsesWith(Trunc);
1247 }
1248 }
1249#ifndef NDEBUG
1250 // Verify that the cached type info for both A and its users is still
1251 // accurate by comparing it to freshly computed types.
1252 VPTypeAnalysis TypeInfo2(*Plan);
1253 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1254 for (VPUser *U : A->users()) {
1255 auto *R = cast<VPRecipeBase>(U);
1256 for (VPValue *VPV : R->definedValues())
1257 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1258 }
1259#endif
1260 }
1261
1262 // Simplify (X && Y) || (X && !Y) -> X.
1263 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1264 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1265 // recipes to be visited during simplification.
1266 VPValue *X, *Y, *Z;
1267 if (match(Def,
1270 Def->replaceAllUsesWith(X);
1271 Def->eraseFromParent();
1272 return;
1273 }
1274
1275 // x | 1 -> 1
1276 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1277 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1278
1279 // x | 0 -> x
1280 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1281 return Def->replaceAllUsesWith(X);
1282
1283 // x & 0 -> 0
1284 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1285 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1286
1287 // x && false -> false
1288 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1289 return Def->replaceAllUsesWith(Def->getOperand(1));
1290
1291 // (x && y) || (x && z) -> x && (y || z)
1294 // Simplify only if one of the operands has one use to avoid creating an
1295 // extra recipe.
1296 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1297 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1298 return Def->replaceAllUsesWith(
1299 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1300
1301 // x && !x -> 0
1303 return Def->replaceAllUsesWith(Plan->getFalse());
1304
1305 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1306 return Def->replaceAllUsesWith(X);
1307
1308 // select c, false, true -> not c
1309 VPValue *C;
1310 if (match(Def, m_Select(m_VPValue(C), m_False(), m_True())))
1311 return Def->replaceAllUsesWith(Builder.createNot(C));
1312
1313 // select !c, x, y -> select c, y, x
1314 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1315 Def->setOperand(0, C);
1316 Def->setOperand(1, Y);
1317 Def->setOperand(2, X);
1318 return;
1319 }
1320
1321 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1322 // tail folding it is likely that x is a header mask and can be simplified
1323 // further.
1325 m_VPValue(Z))) &&
1326 X->hasMoreThanOneUniqueUser())
1327 return Def->replaceAllUsesWith(
1328 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1329
1330 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1331 return Def->replaceAllUsesWith(A);
1332
1333 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1334 return Def->replaceAllUsesWith(A);
1335
1336 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1337 return Def->replaceAllUsesWith(
1338 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1339
1340 const APInt *APC;
1341 if (match(Def, m_c_Mul(m_VPValue(A), m_APInt(APC))) && APC->isPowerOf2())
1342 return Def->replaceAllUsesWith(Builder.createNaryOp(
1343 Instruction::Shl,
1344 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())},
1345 *cast<VPRecipeWithIRFlags>(Def), Def->getDebugLoc()));
1346
1347 // Don't convert udiv to lshr inside a replicate region, as VPInstructions are
1348 // not allowed in them.
1349 const VPRegionBlock *ParentRegion = Def->getParent()->getParent();
1350 bool IsInReplicateRegion = ParentRegion && ParentRegion->isReplicator();
1351 if (!IsInReplicateRegion && match(Def, m_UDiv(m_VPValue(A), m_APInt(APC))) &&
1352 APC->isPowerOf2())
1353 return Def->replaceAllUsesWith(Builder.createNaryOp(
1354 Instruction::LShr,
1355 {A, Plan->getConstantInt(APC->getBitWidth(), APC->exactLogBase2())}, {},
1356 Def->getDebugLoc()));
1357
1358 if (match(Def, m_Not(m_VPValue(A)))) {
1359 if (match(A, m_Not(m_VPValue(A))))
1360 return Def->replaceAllUsesWith(A);
1361
1362 // Try to fold Not into compares by adjusting the predicate in-place.
1363 CmpPredicate Pred;
1364 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1365 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1366 if (all_of(Cmp->users(),
1368 m_Not(m_Specific(Cmp)),
1369 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1370 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1371 for (VPUser *U : to_vector(Cmp->users())) {
1372 auto *R = cast<VPSingleDefRecipe>(U);
1373 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1374 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1375 R->setOperand(1, Y);
1376 R->setOperand(2, X);
1377 } else {
1378 // not (cmp pred) -> cmp inv_pred
1379 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1380 R->replaceAllUsesWith(Cmp);
1381 }
1382 }
1383 // If Cmp doesn't have a debug location, use the one from the negation,
1384 // to preserve the location.
1385 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1386 Cmp->setDebugLoc(Def->getDebugLoc());
1387 }
1388 }
1389 }
1390
1391 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1392 // any-of (fcmp uno %A, %B), ...
1393 if (match(Def, m_AnyOf())) {
1395 VPRecipeBase *UnpairedCmp = nullptr;
1396 for (VPValue *Op : Def->operands()) {
1397 VPValue *X;
1398 if (Op->getNumUsers() > 1 ||
1400 m_Deferred(X)))) {
1401 NewOps.push_back(Op);
1402 } else if (!UnpairedCmp) {
1403 UnpairedCmp = Op->getDefiningRecipe();
1404 } else {
1405 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1406 UnpairedCmp->getOperand(0), X));
1407 UnpairedCmp = nullptr;
1408 }
1409 }
1410
1411 if (UnpairedCmp)
1412 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1413
1414 if (NewOps.size() < Def->getNumOperands()) {
1415 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1416 return Def->replaceAllUsesWith(NewAnyOf);
1417 }
1418 }
1419
1420 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1421 // This is useful for fmax/fmin without fast-math flags, where we need to
1422 // check if any operand is NaN.
1424 m_Deferred(X)),
1426 m_Deferred(Y))))) {
1427 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1428 return Def->replaceAllUsesWith(NewCmp);
1429 }
1430
1431 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1432 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1433 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1434 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1435 TypeInfo.inferScalarType(Def))
1436 return Def->replaceAllUsesWith(Def->getOperand(1));
1437
1439 m_One()))) {
1440 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1441 if (TypeInfo.inferScalarType(X) != WideStepTy)
1442 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1443 Def->replaceAllUsesWith(X);
1444 return;
1445 }
1446
1447 // For i1 vp.merges produced by AnyOf reductions:
1448 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1450 m_VPValue(X), m_VPValue())) &&
1452 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1453 Def->setOperand(1, Def->getOperand(0));
1454 Def->setOperand(0, Y);
1455 return;
1456 }
1457
1458 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1459 if (Phi->getOperand(0) == Phi->getOperand(1))
1460 Phi->replaceAllUsesWith(Phi->getOperand(0));
1461 return;
1462 }
1463
1464 // Look through ExtractLastLane.
1465 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1466 if (match(A, m_BuildVector())) {
1467 auto *BuildVector = cast<VPInstruction>(A);
1468 Def->replaceAllUsesWith(
1469 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1470 return;
1471 }
1472 if (Plan->hasScalarVFOnly())
1473 return Def->replaceAllUsesWith(A);
1474 }
1475
1476 // Look through ExtractPenultimateElement (BuildVector ....).
1478 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1479 Def->replaceAllUsesWith(
1480 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1481 return;
1482 }
1483
1484 uint64_t Idx;
1486 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1487 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1488 return;
1489 }
1490
1491 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1492 Def->replaceAllUsesWith(
1493 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1494 return;
1495 }
1496
1497 // Look through broadcast of single-scalar when used as select conditions; in
1498 // that case the scalar condition can be used directly.
1499 if (match(Def,
1502 "broadcast operand must be single-scalar");
1503 Def->setOperand(0, C);
1504 return;
1505 }
1506
1507 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1508 if (Phi->getNumOperands() == 1)
1509 Phi->replaceAllUsesWith(Phi->getOperand(0));
1510 return;
1511 }
1512
1513 VPIRValue *IRV;
1514 if (Def->getNumOperands() == 1 &&
1516 return Def->replaceAllUsesWith(IRV);
1517
1518 // Some simplifications can only be applied after unrolling. Perform them
1519 // below.
1520 if (!Plan->isUnrolled())
1521 return;
1522
1523 // After unrolling, extract-lane may be used to extract values from multiple
1524 // scalar sources. Only simplify when extracting from a single scalar source.
1525 VPValue *LaneToExtract;
1526 if (match(Def, m_ExtractLane(m_VPValue(LaneToExtract), m_VPValue(A)))) {
1527 // Simplify extract-lane(%lane_num, %scalar_val) -> %scalar_val.
1529 return Def->replaceAllUsesWith(A);
1530
1531 // Simplify extract-lane with single source to extract-element.
1532 Def->replaceAllUsesWith(Builder.createNaryOp(
1533 Instruction::ExtractElement, {A, LaneToExtract}, Def->getDebugLoc()));
1534 return;
1535 }
1536
1537 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1538 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && isa<VPIRValue>(Y) &&
1539 isa<VPPhi>(X)) {
1540 auto *Phi = cast<VPPhi>(X);
1541 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1542 Phi->getSingleUser() == Def) {
1543 Phi->setOperand(0, Y);
1544 Def->replaceAllUsesWith(Phi);
1545 return;
1546 }
1547 }
1548
1549 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1550 // just the pointer operand.
1551 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1552 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1553 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1554
1555 // VPScalarIVSteps after unrolling can be replaced by their start value, if
1556 // the start index is zero and only the first lane is demanded.
1557 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1558 if (!Steps->getStartIndex() && vputils::onlyFirstLaneUsed(Steps)) {
1559 Steps->replaceAllUsesWith(Steps->getOperand(0));
1560 return;
1561 }
1562 }
1563 // Simplify redundant ReductionStartVector recipes after unrolling.
1564 VPValue *StartV;
1566 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1567 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1568 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1569 return PhiR && PhiR->isInLoop();
1570 });
1571 return;
1572 }
1573
1575 Def->replaceAllUsesWith(A);
1576 return;
1577 }
1578
1579 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1582 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1583 all_of(A->users(),
1584 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1585 return Def->replaceAllUsesWith(A);
1586 }
1587
1588 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1589 return Def->replaceAllUsesWith(A);
1590}
1591
1594 Plan.getEntry());
1595 VPTypeAnalysis TypeInfo(Plan);
1597 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1598 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1599 simplifyRecipe(Def, TypeInfo);
1600 }
1601}
1602
1604 if (Plan.hasScalarVFOnly())
1605 return;
1606
1607 // Try to narrow wide and replicating recipes to single scalar recipes,
1608 // based on VPlan analysis. Only process blocks in the loop region for now,
1609 // without traversing into nested regions, as recipes in replicate regions
1610 // cannot be converted yet.
1613 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1615 VPWidenStoreRecipe>(&R))
1616 continue;
1617 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1618 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1619 continue;
1620
1621 // Convert an unmasked scatter with a uniform address into
1622 // extract-last-lane + scalar store.
1623 // TODO: Add a profitability check comparing the cost of a scatter vs.
1624 // extract + scalar store.
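 // I.e., with a uniform address every lane writes the same location, so the
 // scatter can be replaced by extracting the value of the final lane and
 // issuing a single scalar store of it.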
1625 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1626 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1627 !WidenStoreR->isConsecutive()) {
1628 assert(!WidenStoreR->isReverse() &&
1629               "Non-consecutive memory recipes shouldn't be reversed");
1630 VPValue *Mask = WidenStoreR->getMask();
1631
1632 // Only convert the scatter to a scalar store if it is unmasked.
1633 // TODO: Support converting scatter masked by the header mask to scalar
1634 // store.
1635 if (Mask)
1636 continue;
1637
1639 {WidenStoreR->getOperand(1)});
1640 Extract->insertBefore(WidenStoreR);
1641
1642 // TODO: Sink the scalar store recipe to middle block if possible.
1643 auto *ScalarStore = new VPReplicateRecipe(
1644 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1645 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1646 *WidenStoreR /*Metadata*/);
1647 ScalarStore->insertBefore(WidenStoreR);
1648 WidenStoreR->eraseFromParent();
1649 continue;
1650 }
1651
1652 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1653 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1654 vputils::isSingleScalar(RepR->getOperand(1))) {
1655 auto *Clone = new VPReplicateRecipe(
1656 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1657 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1658 *RepR /*Metadata*/, RepR->getDebugLoc());
1659 Clone->insertBefore(RepOrWidenR);
1660 VPBuilder Builder(Clone);
1661 VPValue *ExtractOp = Clone->getOperand(0);
1662 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1663 ExtractOp =
1664 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1665 ExtractOp =
1666 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1667 Clone->setOperand(0, ExtractOp);
1668 RepR->eraseFromParent();
1669 continue;
1670 }
1671
1672 // Skip recipes that aren't single scalars.
1673 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1674 continue;
1675
1676      // Skip recipes for which conversion to single-scalar would introduce
1677      // additional broadcasts. No extra broadcasts are needed if either only
1678      // the scalars of the recipe are used, or at least one of the operands
1679      // would require a broadcast anyway. In the latter case, the single scalar
1680      // may need to be broadcast, but another broadcast is removed.
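      // E.g. (hypothetical): a uniform add whose results are only consumed by
      // extract-last-lane users can be narrowed to a single-scalar replicate
      // without materializing any additional broadcast.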
1681 if (!all_of(RepOrWidenR->users(),
1682 [RepOrWidenR](const VPUser *U) {
1683 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1684 unsigned Opcode = VPI->getOpcode();
1685 if (Opcode == VPInstruction::ExtractLastLane ||
1686 Opcode == VPInstruction::ExtractLastPart ||
1687 Opcode == VPInstruction::ExtractPenultimateElement)
1688 return true;
1689 }
1690
1691 return U->usesScalars(RepOrWidenR);
1692 }) &&
1693 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1694 if (Op->getSingleUser() != RepOrWidenR)
1695 return false;
1696 // Non-constant live-ins require broadcasts, while constants do not
1697 // need explicit broadcasts.
1698 auto *IRV = dyn_cast<VPIRValue>(Op);
1699 bool LiveInNeedsBroadcast = IRV && !isa<Constant>(IRV->getValue());
1700 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1701 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1702 }))
1703 continue;
1704
1705 auto *Clone = new VPReplicateRecipe(
1706 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1707 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1708 Clone->insertBefore(RepOrWidenR);
1709 RepOrWidenR->replaceAllUsesWith(Clone);
1710 if (isDeadRecipe(*RepOrWidenR))
1711 RepOrWidenR->eraseFromParent();
1712 }
1713 }
1714}
1715
1716/// Check whether all of \p Blend's masks share a common logically and'ed
1717/// value and, if so, remove it from the masks.
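/// For example (sketch): masks (logical-and %c, %m0) and (logical-and %c, %m1)
/// are rewritten to %m0 and %m1, dropping the shared edge mask %c.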
1719 if (Blend->isNormalized())
1720 return;
1721 VPValue *CommonEdgeMask;
1722 if (!match(Blend->getMask(0),
1723 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1724 return;
1725 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1726 if (!match(Blend->getMask(I),
1727 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1728 return;
1729 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1730 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1731}
1732
1733/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1734/// to make sure the masks are simplified.
1735static void simplifyBlends(VPlan &Plan) {
1738 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1739 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1740 if (!Blend)
1741 continue;
1742
1743 removeCommonBlendMask(Blend);
1744
1745 // Try to remove redundant blend recipes.
1746 SmallPtrSet<VPValue *, 4> UniqueValues;
1747 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1748 UniqueValues.insert(Blend->getIncomingValue(0));
1749 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1750 if (!match(Blend->getMask(I), m_False()))
1751 UniqueValues.insert(Blend->getIncomingValue(I));
1752
1753 if (UniqueValues.size() == 1) {
1754 Blend->replaceAllUsesWith(*UniqueValues.begin());
1755 Blend->eraseFromParent();
1756 continue;
1757 }
1758
1759 if (Blend->isNormalized())
1760 continue;
1761
1762 // Normalize the blend so its first incoming value is used as the initial
1763 // value with the others blended into it.
1764
1765 unsigned StartIndex = 0;
1766 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1767        // If a value's mask is used only by the blend then it can be dead-coded.
1768 // TODO: Find the most expensive mask that can be deadcoded, or a mask
1769 // that's used by multiple blends where it can be removed from them all.
1770 VPValue *Mask = Blend->getMask(I);
1771 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1772 StartIndex = I;
1773 break;
1774 }
1775 }
1776
1777 SmallVector<VPValue *, 4> OperandsWithMask;
1778 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1779
1780 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1781 if (I == StartIndex)
1782 continue;
1783 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1784 OperandsWithMask.push_back(Blend->getMask(I));
1785 }
1786
1787 auto *NewBlend =
1788 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1789 OperandsWithMask, Blend->getDebugLoc());
1790 NewBlend->insertBefore(&R);
1791
1792 VPValue *DeadMask = Blend->getMask(StartIndex);
1793 Blend->replaceAllUsesWith(NewBlend);
1794 Blend->eraseFromParent();
1796
1797 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1798 VPValue *NewMask;
1799 if (NewBlend->getNumOperands() == 3 &&
1800 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1801 VPValue *Inc0 = NewBlend->getOperand(0);
1802 VPValue *Inc1 = NewBlend->getOperand(1);
1803 VPValue *OldMask = NewBlend->getOperand(2);
1804 NewBlend->setOperand(0, Inc1);
1805 NewBlend->setOperand(1, Inc0);
1806 NewBlend->setOperand(2, NewMask);
1807 if (OldMask->getNumUsers() == 0)
1808 cast<VPInstruction>(OldMask)->eraseFromParent();
1809 }
1810 }
1811 }
1812}
1813
1814/// Optimize the width of vector induction variables in \p Plan based on a known
1815/// constant trip count, \p BestVF and \p BestUF.
1817 ElementCount BestVF,
1818 unsigned BestUF) {
1819 // Only proceed if we have not completely removed the vector region.
1820 if (!Plan.getVectorLoopRegion())
1821 return false;
1822
1823 const APInt *TC;
1824 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1825 return false;
1826
1827 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1828 // and UF. Returns at least 8.
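  // Worked example: TC = 1000 with VF * UF = 8 gives AlignedTC = 1000 and
  // MaxVal = 999 (10 active bits), which rounds up to a 16-bit IV.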
1829 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1830 APInt AlignedTC =
1833 APInt MaxVal = AlignedTC - 1;
1834 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1835 };
1836 unsigned NewBitWidth =
1837 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1838
1839 LLVMContext &Ctx = Plan.getContext();
1840 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1841
1842 bool MadeChange = false;
1843
1844 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1845 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1846 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1847
1848    // Currently we only handle canonical IVs, as it is trivial to replace their
1849    // start and stop values, and we only perform the optimization when the IV
1850    // has a single unique user.
1851 if (!WideIV || !WideIV->isCanonical() ||
1852 WideIV->hasMoreThanOneUniqueUser() ||
1853 NewIVTy == WideIV->getScalarType())
1854 continue;
1855
1856 // Currently only handle cases where the single user is a header-mask
1857 // comparison with the backedge-taken-count.
1858 VPUser *SingleUser = WideIV->getSingleUser();
1859 if (!SingleUser ||
1860 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1863 continue;
1864
1865 // Update IV operands and comparison bound to use new narrower type.
1866 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1867 WideIV->setStartValue(NewStart);
1868 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1869 WideIV->setStepValue(NewStep);
1870
1871 auto *NewBTC = new VPWidenCastRecipe(
1872 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1873 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1874 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1875 Cmp->setOperand(1, NewBTC);
1876
1877 MadeChange = true;
1878 }
1879
1880 return MadeChange;
1881}
1882
1883/// Return true if \p Cond is known to be true for given \p BestVF and \p
1884/// BestUF.
1886 ElementCount BestVF, unsigned BestUF,
1889 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1890 &PSE](VPValue *C) {
1891 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1892 });
1893
1894 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1896 m_Specific(CanIV->getBackedgeValue()),
1897 m_Specific(&Plan.getVectorTripCount()))))
1898 return false;
1899
1900 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1901 // count is not conveniently available as SCEV so far, so we compare directly
1902 // against the original trip count. This is stricter than necessary, as we
1903 // will only return true if the trip count == vector trip count.
1904 const SCEV *VectorTripCount =
1906 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1907 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1908 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1909 "Trip count SCEV must be computable");
1910 ScalarEvolution &SE = *PSE.getSE();
1911 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1912 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1913 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1914}
1915
1916/// Try to replace multiple active lane masks used for control flow with
1917/// a single, wide active lane mask instruction followed by multiple
1918/// extract subvector intrinsics. This applies to the active lane mask
1919/// instructions both in the loop and in the preheader.
1920/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1921/// new extracts from the first active lane mask, which has its last
1922/// operand (multiplier) set to UF.
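/// For example (sketch, VF = 4, UF = 2): instead of two <4 x i1> lane masks,
/// a single mask with multiplier 2 (i.e. <8 x i1>) is created, and the two
/// <4 x i1> values are recovered via llvm.vector.extract at offsets 0 and 4.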
1924 unsigned UF) {
1925 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1926 return false;
1927
1928 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1929 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1930 auto *Term = &ExitingVPBB->back();
1931
1932 using namespace llvm::VPlanPatternMatch;
1934 m_VPValue(), m_VPValue(), m_VPValue())))))
1935 return false;
1936
1937 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1938 LLVMContext &Ctx = Plan.getContext();
1939
1940 auto ExtractFromALM = [&](VPInstruction *ALM,
1941 SmallVectorImpl<VPValue *> &Extracts) {
1942 DebugLoc DL = ALM->getDebugLoc();
1943 for (unsigned Part = 0; Part < UF; ++Part) {
1945 Ops.append({ALM, Plan.getOrAddLiveIn(
1946 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1947 VF.getKnownMinValue() * Part))});
1948 auto *Ext =
1949 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1950 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1951 Extracts[Part] = Ext;
1952 Ext->insertAfter(ALM);
1953 }
1954 };
1955
1956 // Create a list of each active lane mask phi, ordered by unroll part.
1958 for (VPRecipeBase &R : Header->phis()) {
1960 if (!Phi)
1961 continue;
1962 VPValue *Index = nullptr;
1963 match(Phi->getBackedgeValue(),
1965 assert(Index && "Expected index from ActiveLaneMask instruction");
1966
1967 uint64_t Part;
1968 if (match(Index,
1970 m_VPValue(), m_ConstantInt(Part))))
1971 Phis[Part] = Phi;
1972 else
1973 // Anything other than a CanonicalIVIncrementForPart is part 0
1974 Phis[0] = Phi;
1975 }
1976
1977 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1978 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1979
1980 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1981 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1982
1983 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1984 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1985 "Expected incoming values of Phi to be ActiveLaneMasks");
1986
1987  // When using wide lane masks, the get.active.lane.mask intrinsic produces
1988  // VF * UF lanes; UF is passed as the multiplier (last operand).
1989 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1990 EntryALM->setOperand(2, ALMMultiplier);
1991 LoopALM->setOperand(2, ALMMultiplier);
1992
1993 // Create UF x extract vectors and insert into preheader.
1994 SmallVector<VPValue *> EntryExtracts(UF);
1995 ExtractFromALM(EntryALM, EntryExtracts);
1996
1997 // Create UF x extract vectors and insert before the loop compare & branch,
1998 // updating the compare to use the first extract.
1999 SmallVector<VPValue *> LoopExtracts(UF);
2000 ExtractFromALM(LoopALM, LoopExtracts);
2001 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
2002 Not->setOperand(0, LoopExtracts[0]);
2003
2004 // Update the incoming values of active lane mask phis.
2005 for (unsigned Part = 0; Part < UF; ++Part) {
2006 Phis[Part]->setStartValue(EntryExtracts[Part]);
2007 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
2008 }
2009
2010 return true;
2011}
2012
2013/// Try to simplify the branch condition of \p Plan. This may restrict the
2014/// resulting plan to \p BestVF and \p BestUF.
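/// E.g. if the vector trip count is known to be <= VF * UF, the latch
/// terminator is folded to branch-on-cond true and, when the header phis
/// permit it, the now single-iteration loop region is dissolved entirely.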
2016 unsigned BestUF,
2018 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
2019 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
2020 auto *Term = &ExitingVPBB->back();
2021 VPValue *Cond;
2022 if (match(Term, m_BranchOnCount()) ||
2024 m_VPValue(), m_VPValue(), m_VPValue()))))) {
2025 // Try to simplify the branch condition if VectorTC <= VF * UF when the
2026 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
2027 const SCEV *VectorTripCount =
2029 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2030 VectorTripCount =
2032 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2033 "Trip count SCEV must be computable");
2034 ScalarEvolution &SE = *PSE.getSE();
2035 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2036 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2037 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2038 return false;
2039 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2041 // For BranchOnCond, check if we can prove the condition to be true using VF
2042 // and UF.
2043 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2044 return false;
2045 } else {
2046 return false;
2047 }
2048
2049 // The vector loop region only executes once. If possible, completely remove
2050 // the region, otherwise replace the terminator controlling the latch with
2051 // (BranchOnCond true).
2052 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2053 // support for other non-canonical widen induction recipes (e.g.,
2054 // VPWidenPointerInductionRecipe).
2055 // TODO: fold branch-on-constant after dissolving region.
2056 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2057 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2058 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2059 return R->isCanonical();
2060 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2061 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2062 })) {
2063 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2064 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2065 VPBuilder Builder(Plan.getVectorPreheader());
2066 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2067 R->getScalarType());
2068 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2069 HeaderR.eraseFromParent();
2070 continue;
2071 }
2072 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2073 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2074 HeaderR.eraseFromParent();
2075 }
2076
2077 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2078 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2079 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2080 for (VPBlockBase *Exit : Exits)
2081 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2082
2083 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2084 B->setParent(nullptr);
2085
2086 VPBlockUtils::connectBlocks(Preheader, Header);
2087
2088 for (VPBlockBase *Exit : Exits)
2089 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2090
2091 // Replace terminating branch-on-two-conds with branch-on-cond to early
2092 // exit.
2093 if (Exits.size() != 1) {
2094 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2095 "BranchOnTwoConds needs 2 remaining exits");
2097 Term->getOperand(0));
2098 }
2100 } else {
2101 // The vector region contains header phis for which we cannot remove the
2102 // loop region yet.
2103
2104 // For BranchOnTwoConds, set the latch exit condition to true directly.
2105 if (match(Term, m_BranchOnTwoConds())) {
2106 Term->setOperand(1, Plan.getTrue());
2107 return true;
2108 }
2109
2110 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2111 {}, {}, Term->getDebugLoc());
2112 ExitingVPBB->appendRecipe(BOC);
2113 }
2114
2115 Term->eraseFromParent();
2116
2117 return true;
2118}
2119
2120/// From the definition of llvm.experimental.get.vector.length,
2121/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
2125 vp_depth_first_deep(Plan.getEntry()))) {
2126 for (VPRecipeBase &R : *VPBB) {
2127 VPValue *AVL;
2128 if (!match(&R, m_EVL(m_VPValue(AVL))))
2129 continue;
2130
2131 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2132 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2133 continue;
2134 ScalarEvolution &SE = *PSE.getSE();
2135 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2136 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2137 continue;
2138
2140 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2141 R.getDebugLoc());
2142 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2143 return true;
2144 }
2145 }
2146 return false;
2147}
2148
2150 unsigned BestUF,
2152 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2153 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2154
2155 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2156 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2157 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2158 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2159
2160 if (MadeChange) {
2161 Plan.setVF(BestVF);
2162 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2163 }
2164}
2165
2166/// Sink users of \p FOR after the recipe defining the previous value \p
2167/// Previous of the recurrence. \returns true if all users of \p FOR could be
2168/// re-arranged as needed or false if it is not possible.
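/// Rough example (hypothetical recipes):
///   %for  = first-order-recurrence-phi ...
///   %use  = add %for, 1
///   %prev = ...              ; defines the backedge value of %for
/// Here %use (and its transitive users) are moved after %prev, so the
/// recurrence can later be expanded via a first-order splice.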
2169static bool
2171 VPRecipeBase *Previous,
2172 VPDominatorTree &VPDT) {
2173 // Collect recipes that need sinking.
2176 Seen.insert(Previous);
2177 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2178    // The previous value must not depend on the users of the recurrence phi;
2179    // otherwise FOR would not be a fixed-order recurrence.
2180 if (SinkCandidate == Previous)
2181 return false;
2182
2183 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2184 !Seen.insert(SinkCandidate).second ||
2185 VPDT.properlyDominates(Previous, SinkCandidate))
2186 return true;
2187
2188 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2189 return false;
2190
2191 WorkList.push_back(SinkCandidate);
2192 return true;
2193 };
2194
2195 // Recursively sink users of FOR after Previous.
2196 WorkList.push_back(FOR);
2197 for (unsigned I = 0; I != WorkList.size(); ++I) {
2198 VPRecipeBase *Current = WorkList[I];
2199 assert(Current->getNumDefinedValues() == 1 &&
2200 "only recipes with a single defined value expected");
2201
2202 for (VPUser *User : Current->getVPSingleValue()->users()) {
2203 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2204 return false;
2205 }
2206 }
2207
2208 // Keep recipes to sink ordered by dominance so earlier instructions are
2209 // processed first.
2210 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2211 return VPDT.properlyDominates(A, B);
2212 });
2213
2214 for (VPRecipeBase *SinkCandidate : WorkList) {
2215 if (SinkCandidate == FOR)
2216 continue;
2217
2218 SinkCandidate->moveAfter(Previous);
2219 Previous = SinkCandidate;
2220 }
2221 return true;
2222}
2223
2224/// Try to hoist \p Previous and its operands before all users of \p FOR.
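/// This is attempted when sinking the users after \p Previous is not possible;
/// e.g. \p Previous and the recipes computing its operands may be moved above
/// the earliest user of \p FOR, provided they are safe to hoist.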
2226 VPRecipeBase *Previous,
2227 VPDominatorTree &VPDT) {
2228 if (cannotHoistOrSinkRecipe(*Previous))
2229 return false;
2230
2231 // Collect recipes that need hoisting.
2232 SmallVector<VPRecipeBase *> HoistCandidates;
2234 VPRecipeBase *HoistPoint = nullptr;
2235 // Find the closest hoist point by looking at all users of FOR and selecting
2236 // the recipe dominating all other users.
2237 for (VPUser *U : FOR->users()) {
2238 auto *R = cast<VPRecipeBase>(U);
2239 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2240 HoistPoint = R;
2241 }
2242 assert(all_of(FOR->users(),
2243 [&VPDT, HoistPoint](VPUser *U) {
2244 auto *R = cast<VPRecipeBase>(U);
2245 return HoistPoint == R ||
2246 VPDT.properlyDominates(HoistPoint, R);
2247 }) &&
2248 "HoistPoint must dominate all users of FOR");
2249
2250 auto NeedsHoisting = [HoistPoint, &VPDT,
2251 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2252 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2253 if (!HoistCandidate)
2254 return nullptr;
2255 VPRegionBlock *EnclosingLoopRegion =
2256 HoistCandidate->getParent()->getEnclosingLoopRegion();
2257 assert((!HoistCandidate->getRegion() ||
2258 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2259 "CFG in VPlan should still be flat, without replicate regions");
2260 // Hoist candidate was already visited, no need to hoist.
2261 if (!Visited.insert(HoistCandidate).second)
2262 return nullptr;
2263
2264    // The candidate is outside the loop region or a header phi; it dominates
2265    // FOR's users without hoisting.
2266 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2267 return nullptr;
2268
2269 // If we reached a recipe that dominates HoistPoint, we don't need to
2270 // hoist the recipe.
2271 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2272 return nullptr;
2273 return HoistCandidate;
2274 };
2275
2276 if (!NeedsHoisting(Previous->getVPSingleValue()))
2277 return true;
2278
2279 // Recursively try to hoist Previous and its operands before all users of FOR.
2280 HoistCandidates.push_back(Previous);
2281
2282 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2283 VPRecipeBase *Current = HoistCandidates[I];
2284 assert(Current->getNumDefinedValues() == 1 &&
2285 "only recipes with a single defined value expected");
2286 if (cannotHoistOrSinkRecipe(*Current))
2287 return false;
2288
2289 for (VPValue *Op : Current->operands()) {
2290 // If we reach FOR, it means the original Previous depends on some other
2291 // recurrence that in turn depends on FOR. If that is the case, we would
2292 // also need to hoist recipes involving the other FOR, which may break
2293 // dependencies.
2294 if (Op == FOR)
2295 return false;
2296
2297 if (auto *R = NeedsHoisting(Op)) {
2298 // Bail out if the recipe defines multiple values.
2299 // TODO: Hoisting such recipes requires additional handling.
2300 if (R->getNumDefinedValues() != 1)
2301 return false;
2302 HoistCandidates.push_back(R);
2303 }
2304 }
2305 }
2306
2307 // Order recipes to hoist by dominance so earlier instructions are processed
2308 // first.
2309 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2310 return VPDT.properlyDominates(A, B);
2311 });
2312
2313 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2314 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2315 HoistPoint->getIterator());
2316 }
2317
2318 return true;
2319}
2320
2322 VPBuilder &LoopBuilder) {
2323 VPDominatorTree VPDT(Plan);
2324
2326 for (VPRecipeBase &R :
2329 RecurrencePhis.push_back(FOR);
2330
2331 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2333 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2334 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2335 // to terminate.
2336 while (auto *PrevPhi =
2338 assert(PrevPhi->getParent() == FOR->getParent());
2339 assert(SeenPhis.insert(PrevPhi).second);
2340 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2341 }
2342
2343 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2344 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2345 return false;
2346
2347 // Introduce a recipe to combine the incoming and previous values of a
2348 // fixed-order recurrence.
2349 VPBasicBlock *InsertBlock = Previous->getParent();
2350 if (isa<VPHeaderPHIRecipe>(Previous))
2351 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2352 else
2353 LoopBuilder.setInsertPoint(InsertBlock,
2354 std::next(Previous->getIterator()));
2355
2356 auto *RecurSplice =
2358 {FOR, FOR->getBackedgeValue()});
2359
2360 FOR->replaceAllUsesWith(RecurSplice);
2361 // Set the first operand of RecurSplice to FOR again, after replacing
2362 // all users.
2363 RecurSplice->setOperand(0, FOR);
2364
2365 // Check for users extracting at the penultimate active lane of the FOR.
2366 // If only a single lane is active in the current iteration, we need to
2367 // select the last element from the previous iteration (from the FOR phi
2368 // directly).
2369 for (VPUser *U : RecurSplice->users()) {
2371 m_Specific(RecurSplice))))
2372 continue;
2373
2375 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2376 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2377 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2378 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2379 VPValue *PenultimateIndex =
2380 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2381 VPValue *PenultimateLastIter =
2382 B.createNaryOp(VPInstruction::ExtractLane,
2383 {PenultimateIndex, FOR->getBackedgeValue()});
2384 VPValue *LastPrevIter =
2385 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2386
2387 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2388 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2389 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2390 }
2391 }
2392 return true;
2393}
2394
2396 for (VPRecipeBase &R :
2398 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2399 if (!PhiR)
2400 continue;
2401 RecurKind RK = PhiR->getRecurrenceKind();
2402 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2404 continue;
2405
2406 for (VPUser *U : collectUsersRecursively(PhiR))
2407 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2408 RecWithFlags->dropPoisonGeneratingFlags();
2409 }
2410 }
2411}
2412
2413namespace {
2414struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2415 static bool isSentinel(const VPSingleDefRecipe *Def) {
2416 return Def == getEmptyKey() || Def == getTombstoneKey();
2417 }
2418
2419 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2420 /// return that source element type.
2421 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2422 // All VPInstructions that lower to GEPs must have the i8 source element
2423 // type (as they are PtrAdds), so we omit it.
2425 .Case([](const VPReplicateRecipe *I) -> Type * {
2426 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2427 return GEP->getSourceElementType();
2428 return nullptr;
2429 })
2430 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2431 [](auto *I) { return I->getSourceElementType(); })
2432 .Default([](auto *) { return nullptr; });
2433 }
2434
2435  /// Returns true if recipe \p Def can be safely handled for CSE.
2436 static bool canHandle(const VPSingleDefRecipe *Def) {
2437 // We can extend the list of handled recipes in the future,
2438 // provided we account for the data embedded in them while checking for
2439 // equality or hashing.
2440 auto C = getOpcodeOrIntrinsicID(Def);
2441
2442 // The issue with (Insert|Extract)Value is that the index of the
2443 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2444 // VPlan.
2445 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2446 C->second == Instruction::ExtractValue)))
2447 return false;
2448
2449 // During CSE, we can only handle recipes that don't read from memory: if
2450 // they read from memory, there could be an intervening write to memory
2451 // before the next instance is CSE'd, leading to an incorrect result.
2452 return !Def->mayReadFromMemory();
2453 }
2454
2455 /// Hash the underlying data of \p Def.
2456 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2457 const VPlan *Plan = Def->getParent()->getPlan();
2458 VPTypeAnalysis TypeInfo(*Plan);
2459 hash_code Result = hash_combine(
2460 Def->getVPRecipeID(), getOpcodeOrIntrinsicID(Def),
2461 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2463 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2464 if (RFlags->hasPredicate())
2465 return hash_combine(Result, RFlags->getPredicate());
2466 return Result;
2467 }
2468
2469 /// Check equality of underlying data of \p L and \p R.
2470 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2471 if (isSentinel(L) || isSentinel(R))
2472 return L == R;
2473 if (L->getVPRecipeID() != R->getVPRecipeID() ||
2475 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2477 !equal(L->operands(), R->operands()))
2478 return false;
2480 "must have valid opcode info for both recipes");
2481 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2482 if (LFlags->hasPredicate() &&
2483 LFlags->getPredicate() !=
2484 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2485 return false;
2486 // Recipes in replicate regions implicitly depend on predicate. If either
2487 // recipe is in a replicate region, only consider them equal if both have
2488 // the same parent.
2489 const VPRegionBlock *RegionL = L->getRegion();
2490 const VPRegionBlock *RegionR = R->getRegion();
2491 if (((RegionL && RegionL->isReplicator()) ||
2492 (RegionR && RegionR->isReplicator())) &&
2493 L->getParent() != R->getParent())
2494 return false;
2495 const VPlan *Plan = L->getParent()->getPlan();
2496 VPTypeAnalysis TypeInfo(*Plan);
2497 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2498 }
2499};
2500} // end anonymous namespace
2501
2502/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2503/// Plan.
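/// E.g. two recipes computing the same GEP from identical operands in the same
/// region are collapsed to one, with IR flags intersected so that only flags
/// valid for both survive.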
2505 VPDominatorTree VPDT(Plan);
2507
2509 vp_depth_first_deep(Plan.getEntry()))) {
2510 for (VPRecipeBase &R : *VPBB) {
2511 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2512 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2513 continue;
2514 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2515 // V must dominate Def for a valid replacement.
2516 if (!VPDT.dominates(V->getParent(), VPBB))
2517 continue;
2518 // Only keep flags present on both V and Def.
2519 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2520 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2521 Def->replaceAllUsesWith(V);
2522 continue;
2523 }
2524 CSEMap[Def] = Def;
2525 }
2526 }
2527}
2528
2529/// Move loop-invariant recipes out of the vector loop region in \p Plan.
2530static void licm(VPlan &Plan) {
2531 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2532
2533 // Hoist any loop invariant recipes from the vector loop region to the
2534  // preheader. Perform a shallow traversal of the vector loop region, to
2535 // exclude recipes in replicate regions. Since the top-level blocks in the
2536 // vector loop region are guaranteed to execute if the vector pre-header is,
2537 // we don't need to check speculation safety.
2538 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2539 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2540         "Expected vector preheader's successor to be the vector loop region");
2542 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2543 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2545 continue;
2546 if (any_of(R.operands(), [](VPValue *Op) {
2547 return !Op->isDefinedOutsideLoopRegions();
2548 }))
2549 continue;
2550 R.moveBefore(*Preheader, Preheader->end());
2551 }
2552 }
2553}
2554
2556 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2557 if (Plan.hasScalarVFOnly())
2558 return;
2559 // Keep track of created truncates, so they can be re-used. Note that we
2560  // cannot use RAUW after creating a new truncate, as this could make
2561 // other uses have different types for their operands, making them invalidly
2562 // typed.
2564 VPTypeAnalysis TypeInfo(Plan);
2565 VPBasicBlock *PH = Plan.getVectorPreheader();
2568 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2571 continue;
2572
2573 VPValue *ResultVPV = R.getVPSingleValue();
2574 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2575 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2576 if (!NewResSizeInBits)
2577 continue;
2578
2579 // If the value wasn't vectorized, we must maintain the original scalar
2580      // type. Skip those here. Also
2581 // skip casts which do not need to be handled explicitly here, as
2582 // redundant casts will be removed during recipe simplification.
2584 continue;
2585
2586 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2587 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2588 assert(OldResTy->isIntegerTy() && "only integer types supported");
2589 (void)OldResSizeInBits;
2590
2591 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2592
2593 // Any wrapping introduced by shrinking this operation shouldn't be
2594 // considered undefined behavior. So, we can't unconditionally copy
2595 // arithmetic wrapping flags to VPW.
2596 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2597 VPW->dropPoisonGeneratingFlags();
2598
2599 if (OldResSizeInBits != NewResSizeInBits &&
2600 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2601 // Extend result to original width.
2602 auto *Ext =
2603 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2604 Ext->insertAfter(&R);
2605 ResultVPV->replaceAllUsesWith(Ext);
2606 Ext->setOperand(0, ResultVPV);
2607 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2608 } else {
2609 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2610 "Only ICmps should not need extending the result.");
2611 }
2612
2613 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2615 continue;
2616
2617 // Shrink operands by introducing truncates as needed.
2618 unsigned StartIdx =
2619 match(&R, m_Select(m_VPValue(), m_VPValue(), m_VPValue())) ? 1 : 0;
2620 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2621 auto *Op = R.getOperand(Idx);
2622 unsigned OpSizeInBits =
2624 if (OpSizeInBits == NewResSizeInBits)
2625 continue;
2626 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2627 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2628 if (!IterIsEmpty) {
2629 R.setOperand(Idx, ProcessedIter->second);
2630 continue;
2631 }
2632
2633 VPBuilder Builder;
2634 if (isa<VPIRValue>(Op))
2635 Builder.setInsertPoint(PH);
2636 else
2637 Builder.setInsertPoint(&R);
2638 VPWidenCastRecipe *NewOp =
2639 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2640 ProcessedIter->second = NewOp;
2641 R.setOperand(Idx, NewOp);
2642 }
2643
2644 }
2645 }
2646}
2647
2651 VPValue *Cond;
2652 // Skip blocks that are not terminated by BranchOnCond.
2653 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2654 continue;
2655
2656 assert(VPBB->getNumSuccessors() == 2 &&
2657 "Two successors expected for BranchOnCond");
2658 unsigned RemovedIdx;
2659 if (match(Cond, m_True()))
2660 RemovedIdx = 1;
2661 else if (match(Cond, m_False()))
2662 RemovedIdx = 0;
2663 else
2664 continue;
2665
2666 VPBasicBlock *RemovedSucc =
2667 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2668 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2669 "There must be a single edge between VPBB and its successor");
2670    // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2671 // these recipes.
2672 for (VPRecipeBase &R : RemovedSucc->phis())
2673 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2674
2675 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2676 // automatically on VPlan destruction if it becomes unreachable.
2677 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2678 VPBB->back().eraseFromParent();
2679 }
2680}
2681
2701
2702// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2703// the loop terminator with a branch-on-cond recipe with the negated
2704// active-lane-mask as operand. Note that this turns the loop into an
2705// uncountable one. Only the existing terminator is replaced, all other existing
2706// recipes/users remain unchanged, except for poison-generating flags being
2707// dropped from the canonical IV increment. Return the created
2708// VPActiveLaneMaskPHIRecipe.
2709//
2710// The function uses the following definitions:
2711//
2712// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2713// calculate-trip-count-minus-VF (original TC) : original TC
2714// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2715// CanonicalIVPhi : CanonicalIVIncrement
2716// %StartV is the canonical induction start value.
2717//
2718// The function adds the following recipes:
2719//
2720// vector.ph:
2721// %TripCount = calculate-trip-count-minus-VF (original TC)
2722// [if DataWithControlFlowWithoutRuntimeCheck]
2723// %EntryInc = canonical-iv-increment-for-part %StartV
2724// %EntryALM = active-lane-mask %EntryInc, %TripCount
2725//
2726// vector.body:
2727// ...
2728// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2729// ...
2730// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2731// %ALM = active-lane-mask %InLoopInc, TripCount
2732// %Negated = Not %ALM
2733// branch-on-cond %Negated
2734//
2737 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2738 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2739 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2740 VPValue *StartV = CanonicalIVPHI->getStartValue();
2741
2742 auto *CanonicalIVIncrement =
2743 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2744 // TODO: Check if dropping the flags is needed if
2745 // !DataAndControlFlowWithoutRuntimeCheck.
2746 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2747 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2748 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2749 // we have to take unrolling into account. Each part needs to start at
2750 // Part * VF
2751 auto *VecPreheader = Plan.getVectorPreheader();
2752 VPBuilder Builder(VecPreheader);
2753
2754 // Create the ActiveLaneMask instruction using the correct start values.
2755 VPValue *TC = Plan.getTripCount();
2756
2757 VPValue *TripCount, *IncrementValue;
2759 // When the loop is guarded by a runtime overflow check for the loop
2760 // induction variable increment by VF, we can increment the value before
2761    // the get.active.lane.mask intrinsic and use the unmodified trip count.
2762 IncrementValue = CanonicalIVIncrement;
2763 TripCount = TC;
2764 } else {
2765 // When avoiding a runtime check, the active.lane.mask inside the loop
2766 // uses a modified trip count and the induction variable increment is
2767 // done after the active.lane.mask intrinsic is called.
2768 IncrementValue = CanonicalIVPHI;
2769 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2770 {TC}, DL);
2771 }
2772 auto *EntryIncrement = Builder.createOverflowingOp(
2773 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2774 "index.part.next");
2775
2776 // Create the active lane mask instruction in the VPlan preheader.
2777 VPValue *ALMMultiplier =
2778 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2779 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2780 {EntryIncrement, TC, ALMMultiplier}, DL,
2781 "active.lane.mask.entry");
2782
2783 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2784 // preheader ActiveLaneMask instruction.
2785 auto *LaneMaskPhi =
2787 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2788
2789 // Create the active lane mask for the next iteration of the loop before the
2790 // original terminator.
2791 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2792 Builder.setInsertPoint(OriginalTerminator);
2793 auto *InLoopIncrement =
2794 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2795 {IncrementValue}, {false, false}, DL);
2796 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2797 {InLoopIncrement, TripCount, ALMMultiplier},
2798 DL, "active.lane.mask.next");
2799 LaneMaskPhi->addOperand(ALM);
2800
2801 // Replace the original terminator with BranchOnCond. We have to invert the
2802 // mask here because a true condition means jumping to the exit block.
2803 auto *NotMask = Builder.createNot(ALM, DL);
2804 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2805 OriginalTerminator->eraseFromParent();
2806 return LaneMaskPhi;
2807}
2808
2809/// Collect the header mask with the pattern:
2810/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2811/// TODO: Introduce explicit recipe for header-mask instead of searching
2812/// for the header-mask pattern manually.
2814 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2815 SmallVector<VPValue *> WideCanonicalIVs;
2816 auto *FoundWidenCanonicalIVUser = find_if(
2818 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2820                   "Must have at most one VPWidenCanonicalIVRecipe");
2821 if (FoundWidenCanonicalIVUser !=
2822 LoopRegion->getCanonicalIV()->users().end()) {
2823 auto *WideCanonicalIV =
2824 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2825 WideCanonicalIVs.push_back(WideCanonicalIV);
2826 }
2827
2828 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2829 // version of the canonical induction.
2830 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2831 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2832 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2833 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2834 WideCanonicalIVs.push_back(WidenOriginalIV);
2835 }
2836
2837 // Walk users of wide canonical IVs and find the single compare of the form
2838 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2839 VPSingleDefRecipe *HeaderMask = nullptr;
2840 for (auto *Wide : WideCanonicalIVs) {
2841 for (VPUser *U : Wide->users()) {
2842 auto *VPI = dyn_cast<VPInstruction>(U);
2843 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2844 continue;
2845
2846 assert(VPI->getOperand(0) == Wide &&
2847 "WidenCanonicalIV must be the first operand of the compare");
2848 assert(!HeaderMask && "Multiple header masks found?");
2849 HeaderMask = VPI;
2850 }
2851 }
2852 return HeaderMask;
2853}
2854
2856 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2859 UseActiveLaneMaskForControlFlow) &&
2860 "DataAndControlFlowWithoutRuntimeCheck implies "
2861 "UseActiveLaneMaskForControlFlow");
2862
2863 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2864 auto *FoundWidenCanonicalIVUser = find_if(
2866 assert(FoundWidenCanonicalIVUser &&
2867 "Must have widened canonical IV when tail folding!");
2868 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2869 auto *WideCanonicalIV =
2870 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2871 VPSingleDefRecipe *LaneMask;
2872 if (UseActiveLaneMaskForControlFlow) {
2875 } else {
2876 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2877 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2878 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2879 LaneMask =
2880 B.createNaryOp(VPInstruction::ActiveLaneMask,
2881 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2882 nullptr, "active.lane.mask");
2883 }
2884
2885 // Walk users of WideCanonicalIV and replace the header mask of the form
2886 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2887 // removing the old one to ensure there is always only a single header mask.
2888 HeaderMask->replaceAllUsesWith(LaneMask);
2889 HeaderMask->eraseFromParent();
2890}
2891
2892template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2893 Op0_t In;
2895
2896 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2897
2898 template <typename OpTy> bool match(OpTy *V) const {
2899 if (m_Specific(In).match(V)) {
2900 Out = nullptr;
2901 return true;
2902 }
2903 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2904 }
2905};
2906
2907/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2908/// If so, \p Out is set to the remaining mask, or nullptr if the value is \p In itself.
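/// E.g. matching against (logical-and %In, %M) binds \p Out to %M, while
/// matching against %In itself sets \p Out to nullptr.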
2909template <typename Op0_t, typename Op1_t>
2910static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2911 Op1_t &Out) {
2912 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2913}
2914
2915/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2916/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2917/// recipe could be created.
2918/// \p HeaderMask Header Mask.
2919/// \p CurRecipe Recipe to be transformed.
2920/// \p TypeInfo VPlan-based type analysis.
2921/// \p EVL The explicit vector length parameter of vector-predication
2922/// intrinsics.
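/// For example (sketch, not exact VPlan syntax): a widened load masked only by
/// the header mask becomes a vp.load taking the EVL and no mask, and a select
/// on the header mask becomes a vp.merge on the EVL.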
2924 VPRecipeBase &CurRecipe,
2925 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2926 VPlan *Plan = CurRecipe.getParent()->getPlan();
2927 DebugLoc DL = CurRecipe.getDebugLoc();
2928 VPValue *Addr, *Mask, *EndPtr;
2929
2930  /// Adjust any end pointers so that they point to the end of EVL lanes, not VF.
2931 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2932 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2933 EVLEndPtr->insertBefore(&CurRecipe);
2934 EVLEndPtr->setOperand(1, &EVL);
2935 return EVLEndPtr;
2936 };
2937
2938 if (match(&CurRecipe,
2939 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2940 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2941 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2942 EVL, Mask);
2943
2944 VPValue *ReversedVal;
2945 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2946 match(ReversedVal,
2947 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2948 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2949 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2950 auto *LoadR = new VPWidenLoadEVLRecipe(
2951 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2952 LoadR->insertBefore(&CurRecipe);
2953 return new VPWidenIntrinsicRecipe(
2954 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2955 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2956 }
2957
2958 VPValue *StoredVal;
2959 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2960 m_RemoveMask(HeaderMask, Mask))) &&
2961 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2962 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2963 StoredVal, EVL, Mask);
2964
2965 if (match(&CurRecipe,
2966 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2967 m_RemoveMask(HeaderMask, Mask))) &&
2968 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2969 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2970 auto *NewReverse = new VPWidenIntrinsicRecipe(
2971 Intrinsic::experimental_vp_reverse,
2972 {ReversedVal, Plan->getTrue(), &EVL},
2973 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2974 NewReverse->insertBefore(&CurRecipe);
2975 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2976 AdjustEndPtr(EndPtr), NewReverse, EVL,
2977 Mask);
2978 }
2979
2980 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2981 if (Rdx->isConditional() &&
2982 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2983 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2984
2985 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2986 if (Interleave->getMask() &&
2987 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2988 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2989
2990 VPValue *LHS, *RHS;
2991 if (match(&CurRecipe,
2992 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2993 return new VPWidenIntrinsicRecipe(
2994 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2995 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2996
2997 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2998 m_VPValue(RHS))))
2999 return new VPWidenIntrinsicRecipe(
3000 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
3001 TypeInfo.inferScalarType(LHS), {}, {}, DL);
3002
3003 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
3004 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
3005 VPValue *ZExt =
3006 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
3007 return new VPInstruction(Instruction::Sub,
3008 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
3009 }
3010
3011 return nullptr;
3012}
3013
3014/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
3015/// The transforms here need to preserve the original semantics.
3017 // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
3018 VPValue *HeaderMask = nullptr, *EVL = nullptr;
3021 m_VPValue(EVL))) &&
3022 match(EVL, m_EVL(m_VPValue()))) {
3023 HeaderMask = R.getVPSingleValue();
3024 break;
3025 }
3026 }
3027 if (!HeaderMask)
3028 return;
3029
3030 VPTypeAnalysis TypeInfo(Plan);
3031 SmallVector<VPRecipeBase *> OldRecipes;
3032 for (VPUser *U : collectUsersRecursively(HeaderMask)) {
3034 if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
3035 NewR->insertBefore(R);
3036 for (auto [Old, New] :
3037 zip_equal(R->definedValues(), NewR->definedValues()))
3038 Old->replaceAllUsesWith(New);
3039 OldRecipes.push_back(R);
3040 }
3041 }
3042 // Erase old recipes at the end so we don't invalidate TypeInfo.
3043 for (VPRecipeBase *R : reverse(OldRecipes)) {
3044 SmallVector<VPValue *> PossiblyDead(R->operands());
3045 R->eraseFromParent();
3046 for (VPValue *Op : PossiblyDead)
3048 }
3049}
3050
3051/// After replacing the canonical IV with an EVL-based IV, fix up recipes that use
3052/// VF to use the EVL instead to avoid incorrect updates on the penultimate
3053/// iteration.
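/// E.g. a widened pointer induction that previously advanced by VFxUF per
/// iteration must advance by the EVL computed in that iteration instead.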
3054static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
3055 VPTypeAnalysis TypeInfo(Plan);
3056 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3057 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3058
3059 assert(all_of(Plan.getVF().users(),
3062 "User of VF that we can't transform to EVL.");
3063 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3065 });
3066
3067 assert(all_of(Plan.getVFxUF().users(),
3068 [&LoopRegion, &Plan](VPUser *U) {
3069 return match(U,
3070 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3071 m_Specific(&Plan.getVFxUF()))) ||
3072 isa<VPWidenPointerInductionRecipe>(U);
3073 }) &&
3074 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3075 "increment of the canonical induction.");
3076 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3077    // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3078 // canonical induction must not be updated.
3080 });
3081
3082  // Create a scalar phi to track the previous EVL if the plan contains any
3083  // fixed-order recurrences.
3084 bool ContainsFORs =
3086 if (ContainsFORs) {
3087 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3088 VPValue *MaxEVL = &Plan.getVF();
3089    // Emit VPScalarCastRecipe in preheader if VF is not a 32-bit integer.
3090 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3091 MaxEVL = Builder.createScalarZExtOrTrunc(
3092 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3093 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3094
3095 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3096 VPValue *PrevEVL = Builder.createScalarPhi(
3097 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3098
3101 for (VPRecipeBase &R : *VPBB) {
3102 VPValue *V1, *V2;
3103 if (!match(&R,
3105 m_VPValue(V1), m_VPValue(V2))))
3106 continue;
3107 VPValue *Imm = Plan.getOrAddLiveIn(
3110 Intrinsic::experimental_vp_splice,
3111 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3112 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3113 R.getDebugLoc());
3114 VPSplice->insertBefore(&R);
3115 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3116 }
3117 }
3118 }
3119
3120 VPValue *HeaderMask = findHeaderMask(Plan);
3121 if (!HeaderMask)
3122 return;
3123
3124 // Replace header masks with a mask equivalent to predicating by EVL:
3125 //
3126 // icmp ule widen-canonical-iv backedge-taken-count
3127 // ->
3128 // icmp ult step-vector, EVL
3129 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3130 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3131 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3132 VPValue *EVLMask = Builder.createICmp(
3134 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3135 HeaderMask->replaceAllUsesWith(EVLMask);
3136}
3137
3138/// Converts a tail-folded vector loop region to step by
3139/// VPInstruction::ExplicitVectorLength elements instead of VF elements each
3140/// iteration.
3141///
3142/// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3143/// replaces all uses except the canonical IV increment of
3144/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
3145/// VPCanonicalIVPHIRecipe is used only for loop iterations counting after
3146/// this transformation.
3147///
3148/// - The header mask is replaced with a header mask based on the EVL.
3149///
3150/// - Plans with FORs have a new phi added to keep track of the EVL of the
3151/// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with
3152/// @llvm.vp.splice.
3153///
3154/// The function uses the following definitions:
3155/// %StartV is the canonical induction start value.
3156///
3157/// The function adds the following recipes:
3158///
3159/// vector.ph:
3160/// ...
3161///
3162/// vector.body:
3163/// ...
3164/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3165/// [ %NextEVLIV, %vector.body ]
3166/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3167/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3168/// ...
3169/// %OpEVL = cast i32 %VPEVL to IVSize
3170/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3171/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3172/// ...
3173///
3174/// If MaxSafeElements is provided, the function adds the following recipes:
3175/// vector.ph:
3176/// ...
3177///
3178/// vector.body:
3179/// ...
3180/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3181/// [ %NextEVLIV, %vector.body ]
3182/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3183/// %cmp = cmp ult %AVL, MaxSafeElements
3184/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3185/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3186/// ...
3187/// %OpEVL = cast i32 %VPEVL to IVSize
3188/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3189/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3190/// ...
3191///
3193 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3194 if (Plan.hasScalarVFOnly())
3195 return;
3196 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3197 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3198
3199 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3200 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3201 VPValue *StartV = CanonicalIVPHI->getStartValue();
3202
3203 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3204 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3205 EVLPhi->insertAfter(CanonicalIVPHI);
3206 VPBuilder Builder(Header, Header->getFirstNonPhi());
3207 // Create the AVL (application vector length), starting from TC -> 0 in steps
3208 // of EVL.
3209 VPPhi *AVLPhi = Builder.createScalarPhi(
3210 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3211 VPValue *AVL = AVLPhi;
3212
3213 if (MaxSafeElements) {
3214 // Clamp the AVL to MaxSafeElements (the maximum safe dependence distance in elements) for correct loop emission.
3215 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3216 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3217 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3218 "safe_avl");
3219 }
3220 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3221 DebugLoc::getUnknown(), "evl");
3222
3223 auto *CanonicalIVIncrement =
3224 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3225 Builder.setInsertPoint(CanonicalIVIncrement);
3226 VPValue *OpVPEVL = VPEVL;
3227
3228 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3229 OpVPEVL = Builder.createScalarZExtOrTrunc(
3230 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3231
3232 auto *NextEVLIV = Builder.createOverflowingOp(
3233 Instruction::Add, {OpVPEVL, EVLPhi},
3234 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3235 CanonicalIVIncrement->hasNoSignedWrap()},
3236 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3237 EVLPhi->addOperand(NextEVLIV);
3238
3239 VPValue *NextAVL = Builder.createOverflowingOp(
3240 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3241 DebugLoc::getCompilerGenerated(), "avl.next");
3242 AVLPhi->addOperand(NextAVL);
3243
3244 fixupVFUsersForEVL(Plan, *VPEVL);
3245 removeDeadRecipes(Plan);
3246
3247 // Replace all uses of VPCanonicalIVPHIRecipe by
3248 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3249 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3250 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3251 // TODO: support unroll factor > 1.
3252 Plan.setUF(1);
3253}
3254
3256 // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3257 // There should be only one EVL PHI in the entire plan.
3258 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3259
3262 for (VPRecipeBase &R : VPBB->phis())
3263 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3264 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3265 EVLPhi = PhiR;
3266 }
3267
3268 // Early return if no EVL PHI is found.
3269 if (!EVLPhi)
3270 return;
3271
3272 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3273 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3274 VPValue *AVL;
3275 [[maybe_unused]] bool FoundAVL =
3276 match(EVLIncrement,
3277 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3278 assert(FoundAVL && "Didn't find AVL?");
3279
3280 // The AVL may be capped to a safe distance.
3281 VPValue *SafeAVL;
3282 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3283 AVL = SafeAVL;
3284
3285 VPValue *AVLNext;
3286 [[maybe_unused]] bool FoundAVLNext =
3288 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3289 assert(FoundAVLNext && "Didn't find AVL backedge?");
3290
3291 // Convert EVLPhi to concrete recipe.
3292 auto *ScalarR =
3293 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3294 EVLPhi->getDebugLoc(), "evl.based.iv");
3295 EVLPhi->replaceAllUsesWith(ScalarR);
3296 EVLPhi->eraseFromParent();
3297
3298 // Replace CanonicalIVInc with EVL-PHI increment.
3299 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3300 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3301 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3302 m_Specific(&Plan.getVFxUF()))) &&
3303 "Unexpected canonical iv");
3304 Backedge->replaceAllUsesWith(EVLIncrement);
3305
3306 // Remove unused phi and increment.
3307 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3308 CanonicalIVIncrement->eraseFromParent();
3309 CanonicalIV->eraseFromParent();
3310
3311 // Replace the use of VectorTripCount in the latch-exiting block.
3312 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3313 // After: (branch-on-cond (icmp eq AVLNext, 0))
3314 VPBasicBlock *LatchExiting =
3315 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3316 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3317 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3318 return;
3319
3320 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3321 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3322 m_Specific(&Plan.getVectorTripCount())))) &&
3323 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3324 "trip count");
3325
3326 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3327 VPBuilder Builder(LatchExitingBr);
3328 LatchExitingBr->setOperand(0,
3329 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3330 Plan.getConstantInt(AVLTy, 0)));
3331}
3332
3334 VPlan &Plan, PredicatedScalarEvolution &PSE,
3335 const DenseMap<Value *, const SCEV *> &StridesMap) {
3336 // Replace VPValues for known constant strides guaranteed by predicated
3337 // scalar evolution.
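  // For illustration (with a placeholder stride value %s): if versioning
  // guarantees %s == 1, the live-in for %s, as well as any sext/zext of it
  // used inside the vector loop region, is replaced by the matching constant.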
3338 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3339 auto *R = cast<VPRecipeBase>(&U);
3340 return R->getRegion() ||
3341 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3342 };
3343 ValueToSCEVMapTy RewriteMap;
3344 for (const SCEV *Stride : StridesMap.values()) {
3345 using namespace SCEVPatternMatch;
3346 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3347 const APInt *StrideConst;
3348 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3349 // Only handle constant strides for now.
3350 continue;
3351
3352 auto *CI = Plan.getConstantInt(*StrideConst);
3353 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3354 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3355
3356 // The versioned value may not be used in the loop directly but through a
3357 // sext/zext. Add new live-ins in those cases.
3358 for (Value *U : StrideV->users()) {
3360 continue;
3361 VPValue *StrideVPV = Plan.getLiveIn(U);
3362 if (!StrideVPV)
3363 continue;
3364 unsigned BW = U->getType()->getScalarSizeInBits();
3365 APInt C =
3366 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3367 VPValue *CI = Plan.getConstantInt(C);
3368 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3369 }
3370 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3371 }
3372
3373 for (VPRecipeBase &R : *Plan.getEntry()) {
3374 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3375 if (!ExpSCEV)
3376 continue;
3377 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3378 auto *NewSCEV =
3379 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3380 if (NewSCEV != ScevExpr) {
3381 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3382 ExpSCEV->replaceAllUsesWith(NewExp);
3383 if (Plan.getTripCount() == ExpSCEV)
3384 Plan.resetTripCount(NewExp);
3385 }
3386 }
3387}
3388
3390 VPlan &Plan,
3391 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3392 // Collect recipes in the backward slice of `Root` that may generate a poison
3393 // value that is used after vectorization.
3395 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3397 Worklist.push_back(Root);
3398
3399 // Traverse the backward slice of Root through its use-def chain.
3400 while (!Worklist.empty()) {
3401 VPRecipeBase *CurRec = Worklist.pop_back_val();
3402
3403 if (!Visited.insert(CurRec).second)
3404 continue;
3405
3406 // Prune search if we find another recipe generating a widen memory
3407 // instruction. Widen memory instructions involved in address computation
3408 // will lead to gather/scatter instructions, which don't need to be
3409 // handled.
3411 VPHeaderPHIRecipe>(CurRec))
3412 continue;
3413
3414 // This recipe contributes to the address computation of a widen
3415 // load/store. If the underlying instruction has poison-generating flags,
3416 // drop them directly.
3417 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3418 VPValue *A, *B;
3419 // Dropping disjoint from an OR may yield incorrect results, as some
3420 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3421 // for dependence analysis). Instead, replace it with an equivalent Add.
3422 // This is possible as all users of the disjoint OR only access lanes
3423 // where the operands are disjoint or poison otherwise.
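          // For illustration, with placeholder values %a and %b, the rewrite
          // below turns (or disjoint %a, %b) into (add %a, %b).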
3424 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3425 RecWithFlags->isDisjoint()) {
3426 VPBuilder Builder(RecWithFlags);
3427 VPInstruction *New = Builder.createOverflowingOp(
3428 Instruction::Add, {A, B}, {false, false},
3429 RecWithFlags->getDebugLoc());
3430 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3431 RecWithFlags->replaceAllUsesWith(New);
3432 RecWithFlags->eraseFromParent();
3433 CurRec = New;
3434 } else
3435 RecWithFlags->dropPoisonGeneratingFlags();
3436 } else {
3439 (void)Instr;
3440 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3441 "found instruction with poison generating flags not covered by "
3442 "VPRecipeWithIRFlags");
3443 }
3444
3445 // Add new definitions to the worklist.
3446 for (VPValue *Operand : CurRec->operands())
3447 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3448 Worklist.push_back(OpDef);
3449 }
3450 });
3451
3452 // Traverse all the recipes in the VPlan and collect the poison-generating
3453 // recipes in the backward slice starting at the address of a
3454 // VPWidenMemoryRecipe or VPInterleaveRecipe.
3455 auto Iter = vp_depth_first_deep(Plan.getEntry());
3457 for (VPRecipeBase &Recipe : *VPBB) {
3458 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3459 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3460 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3461 if (AddrDef && WidenRec->isConsecutive() &&
3462 BlockNeedsPredication(UnderlyingInstr.getParent()))
3463 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3464 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3465 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3466 if (AddrDef) {
3467 // Check if any member of the interleave group needs predication.
3468 const InterleaveGroup<Instruction> *InterGroup =
3469 InterleaveRec->getInterleaveGroup();
3470 bool NeedPredication = false;
3471 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3472 I < NumMembers; ++I) {
3473 Instruction *Member = InterGroup->getMember(I);
3474 if (Member)
3475 NeedPredication |= BlockNeedsPredication(Member->getParent());
3476 }
3477
3478 if (NeedPredication)
3479 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3480 }
3481 }
3482 }
3483 }
3484}
3485
3487 VPlan &Plan,
3489 &InterleaveGroups,
3490 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3491 if (InterleaveGroups.empty())
3492 return;
3493
3494 // Interleave memory: for each Interleave Group we marked earlier as relevant
3495 // for this VPlan, replace the Recipes widening its memory instructions with a
3496 // single VPInterleaveRecipe at its insertion point.
3497 VPDominatorTree VPDT(Plan);
3498 for (const auto *IG : InterleaveGroups) {
3499 auto *Start =
3500 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3501 VPIRMetadata InterleaveMD(*Start);
3502 SmallVector<VPValue *, 4> StoredValues;
3503 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3504 StoredValues.push_back(StoreR->getStoredValue());
3505 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3506 Instruction *MemberI = IG->getMember(I);
3507 if (!MemberI)
3508 continue;
3509 VPWidenMemoryRecipe *MemoryR =
3510 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3511 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3512 StoredValues.push_back(StoreR->getStoredValue());
3513 InterleaveMD.intersect(*MemoryR);
3514 }
3515
3516 bool NeedsMaskForGaps =
3517 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3518 (!StoredValues.empty() && !IG->isFull());
3519
3520 Instruction *IRInsertPos = IG->getInsertPos();
3521 auto *InsertPos =
3522 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3523
3525 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3526 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3527 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3528
3529 // Get or create the start address for the interleave group.
3530 VPValue *Addr = Start->getAddr();
3531 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3532 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3533 // We cannot re-use the address of member zero because it does not
3534 // dominate the insert position. Instead, use the address of the insert
3535 // position and create a PtrAdd adjusting it to the address of member
3536 // zero.
3537 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3538 // InsertPos or sink loads above zero members to join it.
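      // For illustration (hypothetical group): for i32 members with the
      // insert position at index 2, member zero's address is the insert
      // position's address plus -(4 * 2) = -8 bytes.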
3539 assert(IG->getIndex(IRInsertPos) != 0 &&
3540 "index of insert position shouldn't be zero");
3541 auto &DL = IRInsertPos->getDataLayout();
3542 APInt Offset(32,
3543 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3544 IG->getIndex(IRInsertPos),
3545 /*IsSigned=*/true);
3546 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3547 VPBuilder B(InsertPos);
3548 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3549 }
3550 // If the group is reverse, adjust the index to refer to the last vector
3551 // lane instead of the first. We adjust the index from the first vector
3552 // lane, rather than directly getting the pointer for lane VF - 1, because
3553 // the pointer operand of the interleaved access is supposed to be uniform.
3554 if (IG->isReverse()) {
3555 auto *ReversePtr = new VPVectorEndPointerRecipe(
3556 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3557 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3558 ReversePtr->insertBefore(InsertPos);
3559 Addr = ReversePtr;
3560 }
3561 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3562 InsertPos->getMask(), NeedsMaskForGaps,
3563 InterleaveMD, InsertPos->getDebugLoc());
3564 VPIG->insertBefore(InsertPos);
3565
3566 unsigned J = 0;
3567 for (unsigned i = 0; i < IG->getFactor(); ++i)
3568 if (Instruction *Member = IG->getMember(i)) {
3569 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3570 if (!Member->getType()->isVoidTy()) {
3571 VPValue *OriginalV = MemberR->getVPSingleValue();
3572 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3573 J++;
3574 }
3575 MemberR->eraseFromParent();
3576 }
3577 }
3578}
3579
3580/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3581/// value, phi and backedge value. In the following example:
3582///
3583/// vector.ph:
3584/// Successor(s): vector loop
3585///
3586/// <x1> vector loop: {
3587/// vector.body:
3588/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3589/// ...
3590/// EMIT branch-on-count ...
3591/// No successors
3592/// }
3593///
3594/// WIDEN-INDUCTION will get expanded to:
3595///
3596/// vector.ph:
3597/// ...
3598/// vp<%induction.start> = ...
3599/// vp<%induction.increment> = ...
3600///
3601/// Successor(s): vector loop
3602///
3603/// <x1> vector loop: {
3604/// vector.body:
3605/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3606/// ...
3607/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3608/// EMIT branch-on-count ...
3609/// No successors
3610/// }
3611static void
3613 VPTypeAnalysis &TypeInfo) {
3614 VPlan *Plan = WidenIVR->getParent()->getPlan();
3615 VPValue *Start = WidenIVR->getStartValue();
3616 VPValue *Step = WidenIVR->getStepValue();
3617 VPValue *VF = WidenIVR->getVFValue();
3618 DebugLoc DL = WidenIVR->getDebugLoc();
3619
3620 // The value from the original loop to which we are mapping the new induction
3621 // variable.
3622 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3623
3624 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3627 VPIRFlags Flags = *WidenIVR;
3628 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3629 AddOp = Instruction::Add;
3630 MulOp = Instruction::Mul;
3631 } else {
3632 AddOp = ID.getInductionOpcode();
3633 MulOp = Instruction::FMul;
3634 }
3635
3636 // If the phi is truncated, truncate the start and step values.
3637 VPBuilder Builder(Plan->getVectorPreheader());
3638 Type *StepTy = TypeInfo.inferScalarType(Step);
3639 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3640 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3641 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3642 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3643 // Truncation doesn't preserve WrapFlags.
3644 Flags.dropPoisonGeneratingFlags();
3645 StepTy = Ty;
3646 }
3647
3648 // Construct the initial value of the vector IV in the vector loop preheader.
3649 Type *IVIntTy =
3651 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3652 if (StepTy->isFloatingPointTy())
3653 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3654
3655 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3656 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3657
3658 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3659 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3660 DebugLoc::getUnknown(), "induction");
3661
3662 // Create the widened phi of the vector IV.
3663 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3664 WidenIVR->getDebugLoc(), "vec.ind");
3665 WidePHI->insertBefore(WidenIVR);
3666
3667 // Create the backedge value for the vector IV.
3668 VPValue *Inc;
3669 VPValue *Prev;
3670 // If unrolled, use the increment and prev value from the operands.
3671 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3672 Inc = SplatVF;
3673 Prev = WidenIVR->getLastUnrolledPartOperand();
3674 } else {
3675 if (VPRecipeBase *R = VF->getDefiningRecipe())
3676 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3677 // Multiply the vectorization factor by the step using integer or
3678 // floating-point arithmetic as appropriate.
3679 if (StepTy->isFloatingPointTy())
3680 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3681 DL);
3682 else
3683 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3684 TypeInfo.inferScalarType(VF), DL);
3685
3686 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3687 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3688 Prev = WidePHI;
3689 }
3690
3692 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3693 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3694 WidenIVR->getDebugLoc(), "vec.ind.next");
3695
3696 WidePHI->addOperand(Next);
3697
3698 WidenIVR->replaceAllUsesWith(WidePHI);
3699}
3700
3701/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3702/// initial value, phi and backedge value. In the following example:
3703///
3704/// <x1> vector loop: {
3705/// vector.body:
3706/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3707/// ...
3708/// EMIT branch-on-count ...
3709/// }
3710///
3711/// WIDEN-POINTER-INDUCTION will get expanded to:
3712///
3713/// <x1> vector loop: {
3714/// vector.body:
3715/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3716/// EMIT %mul = mul %stepvector, %step
3717/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3718/// ...
3719/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3720/// EMIT branch-on-count ...
3721/// }
3723 VPTypeAnalysis &TypeInfo) {
3724 VPlan *Plan = R->getParent()->getPlan();
3725 VPValue *Start = R->getStartValue();
3726 VPValue *Step = R->getStepValue();
3727 VPValue *VF = R->getVFValue();
3728
3729 assert(R->getInductionDescriptor().getKind() ==
3731 "Not a pointer induction according to InductionDescriptor!");
3732 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3733 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3734 "Recipe should have been replaced");
3735
3736 VPBuilder Builder(R);
3737 DebugLoc DL = R->getDebugLoc();
3738
3739 // Build a scalar pointer phi.
3740 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3741
3742 // Create actual address geps that use the pointer phi as base and a
3743 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3744 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3745 Type *StepTy = TypeInfo.inferScalarType(Step);
3746 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3747 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3748 VPValue *PtrAdd = Builder.createNaryOp(
3749 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3750 R->replaceAllUsesWith(PtrAdd);
3751
3752 // Create the backedge value for the scalar pointer phi.
3754 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3755 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3756 DL);
3757 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3758
3759 VPValue *InductionGEP =
3760 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3761 ScalarPtrPhi->addOperand(InductionGEP);
3762}
3763
3765 // Replace loop regions with explicit CFG.
3766 SmallVector<VPRegionBlock *> LoopRegions;
3768 vp_depth_first_deep(Plan.getEntry()))) {
3769 if (!R->isReplicator())
3770 LoopRegions.push_back(R);
3771 }
3772 for (VPRegionBlock *R : LoopRegions)
3773 R->dissolveToCFGLoop();
3774}
3775
3778 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3779 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3782 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3783 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3784 }
3785
3786 // Expand BranchOnTwoConds instructions into explicit CFG with two new
3787 // single-condition branches:
3788 // 1. A branch that replaces BranchOnTwoConds, jumps to the first successor if
3789 // the first condition is true, and otherwise jumps to a new interim block.
3790 // 2. A branch that ends the interim block, jumps to the second successor if
3791 // the second condition is true, and otherwise jumps to the third
3792 // successor.
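  // For illustration, with placeholder names, a block ending in
  //   BranchOnTwoConds %c0, %c1 (successors: S0, S1, S2)
  // becomes
  //   bb:         BranchOnCond %c0 (successors: S0, bb.interim)
  //   bb.interim: BranchOnCond %c1 (successors: S1, S2)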
3793 for (VPInstruction *Br : WorkList) {
3794 assert(Br->getNumOperands() == 2 &&
3795 "BranchOnTwoConds must have exactly 2 conditions");
3796 DebugLoc DL = Br->getDebugLoc();
3797 VPBasicBlock *BrOnTwoCondsBB = Br->getParent();
3798 const auto Successors = to_vector(BrOnTwoCondsBB->getSuccessors());
3799 assert(Successors.size() == 3 &&
3800 "BranchOnTwoConds must have exactly 3 successors");
3801
3802 for (VPBlockBase *Succ : Successors)
3803 VPBlockUtils::disconnectBlocks(BrOnTwoCondsBB, Succ);
3804
3805 VPValue *Cond0 = Br->getOperand(0);
3806 VPValue *Cond1 = Br->getOperand(1);
3807 VPBlockBase *Succ0 = Successors[0];
3808 VPBlockBase *Succ1 = Successors[1];
3809 VPBlockBase *Succ2 = Successors[2];
3810 assert(!Succ0->getParent() && !Succ1->getParent() && !Succ2->getParent() &&
3811 !BrOnTwoCondsBB->getParent() && "regions must already be dissolved");
3812
3813 VPBasicBlock *InterimBB =
3814 Plan.createVPBasicBlock(BrOnTwoCondsBB->getName() + ".interim");
3815
3816 VPBuilder(BrOnTwoCondsBB)
3818 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, Succ0);
3819 VPBlockUtils::connectBlocks(BrOnTwoCondsBB, InterimBB);
3820
3822 VPBlockUtils::connectBlocks(InterimBB, Succ1);
3823 VPBlockUtils::connectBlocks(InterimBB, Succ2);
3824 Br->eraseFromParent();
3825 }
3826}
3827
3829 VPTypeAnalysis TypeInfo(Plan);
3832 vp_depth_first_deep(Plan.getEntry()))) {
3833 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3834 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3835 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3836 ToRemove.push_back(WidenIVR);
3837 continue;
3838 }
3839
3840 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3841 // If the recipe only generates scalars, scalarize it instead of
3842 // expanding it.
3843 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3844 VPBuilder Builder(WidenIVR);
3845 VPValue *PtrAdd =
3846 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3847 WidenIVR->replaceAllUsesWith(PtrAdd);
3848 ToRemove.push_back(WidenIVR);
3849 continue;
3850 }
3851 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3852 ToRemove.push_back(WidenIVR);
3853 continue;
3854 }
3855
3856 // Expand VPBlendRecipe into VPInstruction::Select.
3857 VPBuilder Builder(&R);
3858 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3859 VPValue *Select = Blend->getIncomingValue(0);
3860 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3861 Select = Builder.createSelect(Blend->getMask(I),
3862 Blend->getIncomingValue(I), Select,
3863 R.getDebugLoc(), "predphi");
3864 Blend->replaceAllUsesWith(Select);
3865 ToRemove.push_back(Blend);
3866 }
3867
3868 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3869 Expr->decompose();
3870 ToRemove.push_back(Expr);
3871 }
3872
3873 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
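      // For a single placeholder mask %M this computes:
      //   last-active-lane(%M) = first-active-lane(not(%M)) - 1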
3874 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3875 if (LastActiveL &&
3876 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3877 // Create Not(Mask) for all operands.
3879 for (VPValue *Op : LastActiveL->operands()) {
3880 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3881 NotMasks.push_back(NotMask);
3882 }
3883
3884 // Create FirstActiveLane on the inverted masks.
3885 VPValue *FirstInactiveLane = Builder.createNaryOp(
3887 LastActiveL->getDebugLoc(), "first.inactive.lane");
3888
3889 // Subtract 1 to get the last active lane.
3890 VPValue *One = Plan.getOrAddLiveIn(
3891 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3892 VPValue *LastLane = Builder.createNaryOp(
3893 Instruction::Sub, {FirstInactiveLane, One},
3894 LastActiveL->getDebugLoc(), "last.active.lane");
3895
3896 LastActiveL->replaceAllUsesWith(LastLane);
3897 ToRemove.push_back(LastActiveL);
3898 continue;
3899 }
3900
3901 // Lower BranchOnCount to ICmp + BranchOnCond.
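      // For illustration, branch-on-count(%iv, %tc) becomes:
      //   %cond = icmp eq %iv, %tc
      //   branch-on-cond %cond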
3902 VPValue *IV, *TC;
3903 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3904 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3905 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3906 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3907 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3908 ToRemove.push_back(BranchOnCountInst);
3909 continue;
3910 }
3911
3912 VPValue *VectorStep;
3913 VPValue *ScalarStep;
3915 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3916 continue;
3917
3918 // Expand WideIVStep.
3919 auto *VPI = cast<VPInstruction>(&R);
3920 Type *IVTy = TypeInfo.inferScalarType(VPI);
3921 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3923 ? Instruction::UIToFP
3924 : Instruction::Trunc;
3925 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3926 }
3927
3928 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3929 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3930 ScalarStep =
3931 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3932 }
3933
3934 VPIRFlags Flags;
3935 if (IVTy->isFloatingPointTy())
3936 Flags = {VPI->getFastMathFlags()};
3937
3938 unsigned MulOpc =
3939 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3940 VPInstruction *Mul = Builder.createNaryOp(
3941 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3942 VectorStep = Mul;
3943 VPI->replaceAllUsesWith(VectorStep);
3944 ToRemove.push_back(VPI);
3945 }
3946 }
3947
3948 for (VPRecipeBase *R : ToRemove)
3949 R->eraseFromParent();
3950}
3951
3953 VPBasicBlock *EarlyExitVPBB,
3954 VPlan &Plan,
3955 VPBasicBlock *HeaderVPBB,
3956 VPBasicBlock *LatchVPBB) {
3957 auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
3958 if (!EarlyExitVPBB->getSinglePredecessor() &&
3959 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3960 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3961 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3962 "unsupported early exit VPBB");
3963 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3964 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3965 // of the phis.
3966 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3967 cast<VPIRPhi>(&R)->swapOperands();
3968 }
3969
3970 VPBuilder Builder(LatchVPBB->getTerminator());
3971 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3972 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3973 "Terminator must be be BranchOnCond");
3974 VPValue *CondOfEarlyExitingVPBB =
3975 EarlyExitingVPBB->getTerminator()->getOperand(0);
3976 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3977 ? CondOfEarlyExitingVPBB
3978 : Builder.createNot(CondOfEarlyExitingVPBB);
3979
3980 // Create a BranchOnTwoConds in the latch that branches to:
3981 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
3982 VPValue *IsEarlyExitTaken =
3983 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3984 VPBasicBlock *VectorEarlyExitVPBB =
3985 Plan.createVPBasicBlock("vector.early.exit");
3986 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
3987
3988 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3989
3990 // Update the exit phis in the early exit block.
3991 VPBuilder MiddleBuilder(MiddleVPBB);
3992 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3993 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3994 auto *ExitIRI = cast<VPIRPhi>(&R);
3995 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3996 // a single predecessor and 1 if it has two.
3997 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3998 if (ExitIRI->getNumOperands() != 1) {
3999 // The first of two operands corresponds to the latch exit, via MiddleVPBB
4000 // predecessor. Extract its final lane.
4001 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
4002 }
4003
4004 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
4005 if (!isa<VPIRValue>(IncomingFromEarlyExit)) {
4006 // Update the incoming value from the early exit.
4007 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
4008 VPInstruction::FirstActiveLane, {CondToEarlyExit},
4009 DebugLoc::getUnknown(), "first.active.lane");
4010 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
4011 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
4012 DebugLoc::getUnknown(), "early.exit.value");
4013 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
4014 }
4015 }
4016
4017 // Replace the conditional branch controlling the latch exit from the vector
4018 // loop with a multi-conditional branch that exits to the vector early exit if
4019 // the early exit has been taken, exits to the middle block if the original
4020 // latch condition is true, and otherwise continues back to the header.
4021 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
4022 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
4023 "Unexpected terminator");
4024 auto *IsLatchExitTaken =
4025 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
4026 LatchExitingBranch->getOperand(1));
4027
4028 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
4029 LatchExitingBranch->eraseFromParent();
4030
4031 Builder.setInsertPoint(LatchVPBB);
4032 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
4033 {IsEarlyExitTaken, IsLatchExitTaken}, LatchDL);
4034 LatchVPBB->clearSuccessors();
4035 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
4036 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
4037}
4038
4039/// This function tries to convert extended in-loop reductions to
4040/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
4041/// valid. The created recipe must be decomposed to its constituent
4042/// recipes before execution.
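/// For example (with a placeholder operand %A), reduce.add(zext i8 %A to i32)
/// may be bundled into a single extended-reduction expression when profitable.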
4043static VPExpressionRecipe *
4045 VFRange &Range) {
4046 Type *RedTy = Ctx.Types.inferScalarType(Red);
4047 VPValue *VecOp = Red->getVecOp();
4048
4049 // Clamp the range if using extended-reduction is profitable.
4050 auto IsExtendedRedValidAndClampRange =
4051 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4053 [&](ElementCount VF) {
4054 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4056
4058 InstructionCost ExtCost =
4059 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4060 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4061
4062 if (Red->isPartialReduction()) {
4065 // FIXME: Move partial reduction creation, costing and clamping
4066 // here from LoopVectorize.cpp.
4067 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4068 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4069 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
4070 RedTy->isFloatingPointTy()
4071 ? std::optional{Red->getFastMathFlags()}
4072 : std::nullopt);
4073 } else if (!RedTy->isFloatingPointTy()) {
4074 // TTI::getExtendedReductionCost only supports integer types.
4075 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4076 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4077 Red->getFastMathFlags(), CostKind);
4078 }
4079 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4080 },
4081 Range);
4082 };
4083
4084 VPValue *A;
4085 // Match reduce(ext(A)).
4086 if (isa<VPWidenCastRecipe>(VecOp) &&
4087 (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) ||
4088 match(VecOp, m_FPExt(m_VPValue(A)))) &&
4089 IsExtendedRedValidAndClampRange(
4090 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4091 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4092 Ctx.Types.inferScalarType(A)))
4093 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4094
4095 return nullptr;
4096}
4097
4098/// This function tries to convert (extended) multiply-accumulate in-loop
4099/// reductions to VPExpressionRecipe and clamp the \p Range if it is beneficial
4100/// and valid. The created VPExpressionRecipe must be decomposed to its
4101/// constituent recipes before execution. Patterns of the
4102/// VPExpressionRecipe:
4103/// reduce.add(mul(...)),
4104/// reduce.add(mul(ext(A), ext(B))),
4105/// reduce.add(ext(mul(ext(A), ext(B)))),
4106/// reduce.fadd(fmul(ext(A), ext(B))).
4107static VPExpressionRecipe *
4109 VPCostContext &Ctx, VFRange &Range) {
4110 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4111 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
4112 Opcode != Instruction::FAdd)
4113 return nullptr;
4114
4115 Type *RedTy = Ctx.Types.inferScalarType(Red);
4116
4117 // Clamp the range if using multiply-accumulate-reduction is profitable.
4118 auto IsMulAccValidAndClampRange =
4120 VPWidenCastRecipe *OuterExt) -> bool {
4122 [&](ElementCount VF) {
4124 Type *SrcTy =
4125 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4126 InstructionCost MulAccCost;
4127
4128 if (Red->isPartialReduction()) {
4129 Type *SrcTy2 =
4130 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4131 // FIXME: Move partial reduction creation, costing and clamping
4132 // here from LoopVectorize.cpp.
4133 MulAccCost = Ctx.TTI.getPartialReductionCost(
4134 Opcode, SrcTy, SrcTy2, RedTy, VF,
4136 Ext0->getOpcode())
4139 Ext1->getOpcode())
4141 Mul->getOpcode(), CostKind,
4142 RedTy->isFloatingPointTy()
4143 ? std::optional{Red->getFastMathFlags()}
4144 : std::nullopt);
4145 } else {
4146 // Only partial reductions support mixed or floating-point extends
4147 // at the moment.
4148 if (Ext0 && Ext1 &&
4149 (Ext0->getOpcode() != Ext1->getOpcode() ||
4150 Ext0->getOpcode() == Instruction::CastOps::FPExt))
4151 return false;
4152
4153 bool IsZExt =
4154 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4155 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4156 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4157 SrcVecTy, CostKind);
4158 }
4159
4160 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4161 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4162 InstructionCost ExtCost = 0;
4163 if (Ext0)
4164 ExtCost += Ext0->computeCost(VF, Ctx);
4165 if (Ext1)
4166 ExtCost += Ext1->computeCost(VF, Ctx);
4167 if (OuterExt)
4168 ExtCost += OuterExt->computeCost(VF, Ctx);
4169
4170 return MulAccCost.isValid() &&
4171 MulAccCost < ExtCost + MulCost + RedCost;
4172 },
4173 Range);
4174 };
4175
4176 VPValue *VecOp = Red->getVecOp();
4177 VPRecipeBase *Sub = nullptr;
4178 VPValue *A, *B;
4179 VPValue *Tmp = nullptr;
4180
4181 // Try to match reduce.fadd(fmul(fpext(...), fpext(...))).
4182 if (match(VecOp, m_FMul(m_FPExt(m_VPValue()), m_FPExt(m_VPValue())))) {
4183 assert(Opcode == Instruction::FAdd &&
4184 "MulAccumulateReduction from an FMul must accumulate into an FAdd "
4185 "instruction");
4186 auto *FMul = dyn_cast<VPWidenRecipe>(VecOp);
4187 if (!FMul)
4188 return nullptr;
4189
4190 auto *RecipeA = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(0));
4191 auto *RecipeB = dyn_cast<VPWidenCastRecipe>(FMul->getOperand(1));
4192
4193 if (RecipeA && RecipeB &&
4194 IsMulAccValidAndClampRange(FMul, RecipeA, RecipeB, nullptr)) {
4195 return new VPExpressionRecipe(RecipeA, RecipeB, FMul, Red);
4196 }
4197 }
4198 if (RedTy->isFloatingPointTy())
4199 return nullptr;
4200
4201 // Sub reductions could have a sub between the add reduction and vec op.
4202 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4203 Sub = VecOp->getDefiningRecipe();
4204 VecOp = Tmp;
4205 }
4206
4207 // If ValB is a constant and can be safely extended, truncate it to the same
4208 // type as ExtA's operand, then extend it to the same type as ExtA. This
4209 // creates two uniform extends that can more easily be matched by the rest of
4210 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4211 // replaced with the new extend of the constant.
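  // For illustration: assuming ExtA is (zext i8 %a to i32) and ValB is the
  // i32 constant 42, ValB is rewritten to (zext (trunc i32 42 to i8) to i32)
  // so that both operands of the mul become matching uniform extends.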
4212 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4213 VPWidenCastRecipe *&ExtB,
4214 VPValue *&ValB, VPWidenRecipe *Mul) {
4215 if (!ExtA || ExtB || !isa<VPIRValue>(ValB))
4216 return;
4217 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4218 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4219 const APInt *Const;
4220 if (!match(ValB, m_APInt(Const)) ||
4222 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4223 return;
4224 // The truncate ensures that the type of each extended operand is the
4225 // same, and it's been proven that the constant can be extended from
4226 // NarrowTy safely. Necessary since ExtA's extended operand would be
4227 // e.g. an i8, while the const will likely be an i32. This will be
4228 // elided by later optimisations.
4229 VPBuilder Builder(Mul);
4230 auto *Trunc =
4231 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4232 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4233 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4234 Mul->setOperand(1, ExtB);
4235 };
4236
4237 // Try to match reduce.add(mul(...)).
4238 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4241 auto *Mul = cast<VPWidenRecipe>(VecOp);
4242
4243 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4244 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4245
4246 // Match reduce.add/sub(mul(ext, ext)).
4247 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4248 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4249 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4250 if (Sub)
4251 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4252 cast<VPWidenRecipe>(Sub), Red);
4253 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4254 }
4255 // TODO: Add an expression type for this variant with a negated mul
4256 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4257 return new VPExpressionRecipe(Mul, Red);
4258 }
4259 // TODO: Add an expression type for negated versions of other expression
4260 // variants.
4261 if (Sub)
4262 return nullptr;
4263
4264 // Match reduce.add(ext(mul(A, B))).
4265 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4266 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4267 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4270
4271 // reduce.add(ext(mul(ext, const)))
4272 // -> reduce.add(ext(mul(ext, ext(const))))
4273 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4274
4275 // reduce.add(ext(mul(ext(A), ext(B))))
4276 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4277 // The inner extends must either have the same opcode as the outer extend, or
4278 // be the same recipe (in which case the multiply can never result in a
4279 // negative value), so the outer extend can be folded away by doing wider
4280 // extends for the operands of the mul.
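    // For illustration (placeholder operands):
    //   reduce.add(sext (mul(sext i8 %a to i16, sext i8 %b to i16)) to i32)
    // can become
    //   reduce.add(mul(sext i8 %a to i32, sext i8 %b to i32))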
4281 if (Ext0 && Ext1 &&
4282 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4283 Ext0->getOpcode() == Ext1->getOpcode() &&
4284 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4285 auto *NewExt0 = new VPWidenCastRecipe(
4286 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4287 *Ext0, *Ext0, Ext0->getDebugLoc());
4288 NewExt0->insertBefore(Ext0);
4289
4290 VPWidenCastRecipe *NewExt1 = NewExt0;
4291 if (Ext0 != Ext1) {
4292 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4293 Ext->getResultType(), nullptr, *Ext1,
4294 *Ext1, Ext1->getDebugLoc());
4295 NewExt1->insertBefore(Ext1);
4296 }
4297 Mul->setOperand(0, NewExt0);
4298 Mul->setOperand(1, NewExt1);
4299 Red->setOperand(1, Mul);
4300 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4301 }
4302 }
4303 return nullptr;
4304}
4305
4306/// This function tries to create abstract recipes from the reduction recipe
4307/// for subsequent optimizations and cost estimation.
4309 VPCostContext &Ctx,
4310 VFRange &Range) {
4311 VPExpressionRecipe *AbstractR = nullptr;
4312 auto IP = std::next(Red->getIterator());
4313 auto *VPBB = Red->getParent();
4314 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4315 AbstractR = MulAcc;
4316 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4317 AbstractR = ExtRed;
4318 // Cannot create abstract inloop reduction recipes.
4319 if (!AbstractR)
4320 return;
4321
4322 AbstractR->insertBefore(*VPBB, IP);
4323 Red->replaceAllUsesWith(AbstractR);
4324}
4325
4336
4338 if (Plan.hasScalarVFOnly())
4339 return;
4340
4341#ifndef NDEBUG
4342 VPDominatorTree VPDT(Plan);
4343#endif
4344
4345 SmallVector<VPValue *> VPValues;
4348 append_range(VPValues, Plan.getLiveIns());
4349 for (VPRecipeBase &R : *Plan.getEntry())
4350 append_range(VPValues, R.definedValues());
4351
4352 auto *VectorPreheader = Plan.getVectorPreheader();
4353 for (VPValue *VPV : VPValues) {
4355 (isa<VPIRValue>(VPV) && isa<Constant>(VPV->getLiveInIRValue())))
4356 continue;
4357
4358 // Add explicit broadcast at the insert point that dominates all users.
4359 VPBasicBlock *HoistBlock = VectorPreheader;
4360 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4361 for (VPUser *User : VPV->users()) {
4362 if (User->usesScalars(VPV))
4363 continue;
4364 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4365 HoistPoint = HoistBlock->begin();
4366 else
4367 assert(VPDT.dominates(VectorPreheader,
4368 cast<VPRecipeBase>(User)->getParent()) &&
4369 "All users must be in the vector preheader or dominated by it");
4370 }
4371
4372 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4373 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4374 VPV->replaceUsesWithIf(Broadcast,
4375 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4376 return Broadcast != &U && !U.usesScalars(VPV);
4377 });
4378 }
4379}
4380
4382 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4383
4384 // Collect candidate loads with invariant addresses and noalias scope
4385 // metadata, and memory-writing recipes with noalias metadata.
4389 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4390 for (VPRecipeBase &R : *VPBB) {
4391 // Only handle single-scalar replicated loads with invariant addresses.
4392 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4393 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4394 RepR->getOpcode() != Instruction::Load)
4395 continue;
4396
4397 VPValue *Addr = RepR->getOperand(0);
4398 if (Addr->isDefinedOutsideLoopRegions()) {
4400 if (!Loc.AATags.Scope)
4401 continue;
4402 CandidateLoads.push_back({RepR, Loc});
4403 }
4404 }
4405 if (R.mayWriteToMemory()) {
4407 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4408 return;
4409 Stores.push_back(*Loc);
4410 }
4411 }
4412 }
4413
4414 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4415 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4416 // Hoist the load to the preheader if it doesn't alias with any stores
4417 // according to the noalias metadata. Other loads should have been hoisted
4418 // by other passes.
4419 const AAMDNodes &LoadAA = LoadLoc.AATags;
4420 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4422 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4423 })) {
4424 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4425 }
4426 }
4427}
4428
4429// Collect common metadata from a group of replicate recipes by intersecting
4430// metadata from all recipes in the group.
4432 VPIRMetadata CommonMetadata = *Recipes.front();
4433 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4434 CommonMetadata.intersect(*Recipe);
4435 return CommonMetadata;
4436}
4437
4438template <unsigned Opcode>
4442 const Loop *L) {
4443 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4444 "Only Load and Store opcodes supported");
4445 constexpr bool IsLoad = (Opcode == Instruction::Load);
4446 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4447 VPTypeAnalysis TypeInfo(Plan);
4448
4449 // Group predicated operations by their address SCEV.
4451 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4452 auto *VPBB = cast<VPBasicBlock>(Block);
4453 for (VPRecipeBase &R : *VPBB) {
4454 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4455 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4456 continue;
4457
4458 // For loads, operand 0 is address; for stores, operand 1 is address.
4459 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4460 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4461 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4462 RecipesByAddress[AddrSCEV].push_back(RepR);
4463 }
4464 }
4465
4466 // For each address, collect operations with the same or complementary masks.
4468 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4469 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4470 };
4471 for (auto &[Addr, Recipes] : RecipesByAddress) {
4472 if (Recipes.size() < 2)
4473 continue;
4474
4475 // Collect groups with the same or complementary masks.
4476 for (VPReplicateRecipe *&RecipeI : Recipes) {
4477 if (!RecipeI)
4478 continue;
4479
4480 VPValue *MaskI = RecipeI->getMask();
4481 Type *TypeI = GetLoadStoreValueType(RecipeI);
4483 Group.push_back(RecipeI);
4484 RecipeI = nullptr;
4485
4486 // Find all operations with the same or complementary masks.
4487 bool HasComplementaryMask = false;
4488 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4489 if (!RecipeJ)
4490 continue;
4491
4492 VPValue *MaskJ = RecipeJ->getMask();
4493 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4494 if (TypeI == TypeJ) {
4495 // Check if any operation in the group has a complementary mask with
4496 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4497 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4498 match(MaskJ, m_Not(m_Specific(MaskI)));
4499 Group.push_back(RecipeJ);
4500 RecipeJ = nullptr;
4501 }
4502 }
4503
4504 if (HasComplementaryMask) {
4505 assert(Group.size() >= 2 && "must have at least 2 entries");
4506 AllGroups.push_back(std::move(Group));
4507 }
4508 }
4509 }
4510
4511 return AllGroups;
4512}
4513
4514// Find the recipe with minimum alignment in the group.
4515template <typename InstType>
4516static VPReplicateRecipe *
4518 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4519 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4520 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4521 });
4522}
4523
4526 const Loop *L) {
4527 auto Groups =
4529 if (Groups.empty())
4530 return;
4531
4532 VPDominatorTree VPDT(Plan);
4533
4534 // Process each group of loads.
4535 for (auto &Group : Groups) {
4536 // Sort loads by dominance order, with earliest (most dominating) first.
4537 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4538 return VPDT.properlyDominates(A, B);
4539 });
4540
4541 // Try to use the earliest (most dominating) load to replace all others.
4542 VPReplicateRecipe *EarliestLoad = Group[0];
4543 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4544 VPBasicBlock *LastBB = Group.back()->getParent();
4545
4546 // Check that the load doesn't alias with stores between first and last.
4547 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4548 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4549 continue;
4550
4551 // Collect common metadata from all loads in the group.
4552 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4553
4554 // Find the load with minimum alignment to use.
4555 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4556
4557 // Create an unpredicated version of the earliest load with common
4558 // metadata.
4559 auto *UnpredicatedLoad = new VPReplicateRecipe(
4560 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4561 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4562 CommonMetadata);
4563
4564 UnpredicatedLoad->insertBefore(EarliestLoad);
4565
4566 // Replace all loads in the group with the unpredicated load.
4567 for (VPReplicateRecipe *Load : Group) {
4568 Load->replaceAllUsesWith(UnpredicatedLoad);
4569 Load->eraseFromParent();
4570 }
4571 }
4572}
4573
4574static bool
4576 PredicatedScalarEvolution &PSE, const Loop &L,
4577 VPTypeAnalysis &TypeInfo) {
4578 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4579 if (!StoreLoc || !StoreLoc->AATags.Scope)
4580 return false;
4581
4582 // When sinking a group of stores, all members of the group alias each other.
4583 // Skip them during the alias checks.
4584 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4585 StoresToSink.end());
4586
4587 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4588 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4589 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4590 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4591}
4592
4595 const Loop *L) {
4596 auto Groups =
4598 if (Groups.empty())
4599 return;
4600
4601 VPDominatorTree VPDT(Plan);
4602 VPTypeAnalysis TypeInfo(Plan);
4603
4604 for (auto &Group : Groups) {
4605 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4606 return VPDT.properlyDominates(A, B);
4607 });
4608
4609 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4610 continue;
4611
4612 // Use the last (most dominated) store's location for the unconditional
4613 // store.
4614 VPReplicateRecipe *LastStore = Group.back();
4615 VPBasicBlock *InsertBB = LastStore->getParent();
4616
4617 // Collect common alias metadata from all stores in the group.
4618 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4619
4620 // Build select chain for stored values.
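    // For illustration, for a group of three stores with stored values V0, V1,
    // V2 and masks M1, M2 for the later members (placeholder names), the value
    // stored by the unconditional store is select(M2, V2, select(M1, V1, V0)).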
4621 VPValue *SelectedValue = Group[0]->getOperand(0);
4622 VPBuilder Builder(InsertBB, LastStore->getIterator());
4623
4624 for (unsigned I = 1; I < Group.size(); ++I) {
4625 VPValue *Mask = Group[I]->getMask();
4626 VPValue *Value = Group[I]->getOperand(0);
4627 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4628 Group[I]->getDebugLoc());
4629 }
4630
4631 // Find the store with minimum alignment to use.
4632 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4633
4634 // Create unconditional store with selected value and common metadata.
4635 auto *UnpredicatedStore =
4636 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4637 {SelectedValue, LastStore->getOperand(1)},
4638 /*IsSingleScalar=*/false,
4639 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4640 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4641
4642 // Remove all predicated stores from the group.
4643 for (VPReplicateRecipe *Store : Group)
4644 Store->eraseFromParent();
4645 }
4646}
4647
4649 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4651 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4652 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4653
4654 VPValue *TC = Plan.getTripCount();
4655 // Skip cases for which the vector trip count may be non-trivial to
4656 // materialize, i.e., when a scalar tail is absent due to tail folding, or
4657 // when a scalar tail is required.
4658 if (!Plan.hasScalarTail() ||
4660 Plan.getScalarPreheader() ||
4661 !isa<VPIRValue>(TC))
4662 return;
4663
4664 // Materialize the vector trip count for constant trip counts early if it can
4665 // simply be computed as (Original TC / (VF * UF)) * (VF * UF).
4666 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4667 // tail-folded loops.
4668 ScalarEvolution &SE = *PSE.getSE();
4669 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4670 if (!isa<SCEVConstant>(TCScev))
4671 return;
4672 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4673 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4674 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4675 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4676}
4677
4679 VPBasicBlock *VectorPH) {
4681 if (BTC->getNumUsers() == 0)
4682 return;
4683
4684 VPBuilder Builder(VectorPH, VectorPH->begin());
4685 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4686 auto *TCMO = Builder.createNaryOp(
4687 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4688 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4689 BTC->replaceAllUsesWith(TCMO);
4690}
4691
4693 if (Plan.hasScalarVFOnly())
4694 return;
4695
4696 VPTypeAnalysis TypeInfo(Plan);
4697 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4698 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4700 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4701 vp_depth_first_shallow(LoopRegion->getEntry()));
4702 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4703 // VPInstructions, excluding ones in replicate regions. Those are not
4704 // materialized explicitly yet; their vector users are still handled in
4705 // VPReplicateRegion::execute(), via shouldPack().
4706 // TODO: materialize build vectors for replicating recipes in replicating
4707 // regions.
4708 for (VPBasicBlock *VPBB :
4709 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4710 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4712 continue;
4713 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4714 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4715 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4716 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4717 };
4718 if ((isa<VPReplicateRecipe>(DefR) &&
4719 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4720 (isa<VPInstruction>(DefR) &&
4722 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4723 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4724 continue;
4725
4726 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4727       unsigned Opcode = ScalarTy->isStructTy()
4728                             ? VPInstruction::BuildStructVector
4729                             : VPInstruction::BuildVector;
4730 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4731 BuildVector->insertAfter(DefR);
4732
4733 DefR->replaceUsesWithIf(
4734 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4735 VPUser &U, unsigned) {
4736 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4737 });
4738 }
4739 }
4740
4741  // Create explicit VPInstructions to convert vectors to scalars. The current
4742  // implementation is conservative: it skips defs whose values may or may not
4743  // be vectors, possibly missing some Unpacks. TODO: introduce Unpacks
4744  // speculatively and remove those later shown to operate on scalar values.
4745 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4746 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4749 continue;
4750 for (VPValue *Def : R.definedValues()) {
4751 // Skip recipes that are single-scalar or only have their first lane
4752 // used.
4753 // TODO: The Defs skipped here may or may not be vector values.
4754 // Introduce Unpacks, and remove them later, if they are guaranteed to
4755 // produce scalar values.
4757 continue;
4758
4759 // At the moment, we create unpacks only for scalar users outside
4760 // replicate regions. Recipes inside replicate regions still extract the
4761 // required lanes implicitly.
4762 // TODO: Remove once replicate regions are unrolled completely.
4763 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4764 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4765 return U->usesScalars(Def) &&
4766 (!ParentRegion || !ParentRegion->isReplicator());
4767 };
4768 if (none_of(Def->users(), IsCandidateUnpackUser))
4769 continue;
4770
4771 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4772 if (R.isPhi())
4773 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4774 else
4775 Unpack->insertAfter(&R);
4776 Def->replaceUsesWithIf(Unpack,
4777 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4778 return IsCandidateUnpackUser(&U);
4779 });
4780 }
4781 }
4782 }
4783}
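// [Editorial note: illustrative example in shorthand VPlan notation, not part
// of the original source; the names %r, %w, %r.v and %w.s are made up.]
// Given a replicating recipe with a vector user and a widened recipe with a
// scalar user outside any replicate region,
//   REPLICATE %r = call @f(%x)   (used by a widened add)
//   WIDEN %w = add %a, %b        (used by a replicating store)
// the transform above materializes the conversions explicitly:
//   EMIT %r.v = buildvector %r   (or buildstructvector for struct results)
//   EMIT %w.s = unpack %w
// and redirects the vector users of %r to %r.v and the scalar users of %w to
// %w.s.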
4784
4786 VPBasicBlock *VectorPHVPBB,
4787 bool TailByMasking,
4788 bool RequiresScalarEpilogue) {
4789 VPSymbolicValue &VectorTC = Plan.getVectorTripCount();
4790 // There's nothing to do if there are no users of the vector trip count or its
4791 // IR value has already been set.
4792 if (VectorTC.getNumUsers() == 0 || VectorTC.getUnderlyingValue())
4793 return;
4794
4795 VPValue *TC = Plan.getTripCount();
4796 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4797 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4798 VPValue *Step = &Plan.getVFxUF();
4799
4800 // If the tail is to be folded by masking, round the number of iterations N
4801 // up to a multiple of Step instead of rounding down. This is done by first
4802 // adding Step-1 and then rounding down. Note that it's ok if this addition
4803 // overflows: the vector induction variable will eventually wrap to zero given
4804 // that it starts at zero and its Step is a power of two; the loop will then
4805 // exit, with the last early-exit vector comparison also producing all-true.
4806 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4807 // is accounted for in emitIterationCountCheck that adds an overflow check.
4808 if (TailByMasking) {
4809 TC = Builder.createNaryOp(
4810 Instruction::Add,
4811 {TC, Builder.createNaryOp(Instruction::Sub,
4812 {Step, Plan.getConstantInt(TCTy, 1)})},
4813 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4814 }
4815
4816 // Now we need to generate the expression for the part of the loop that the
4817 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4818 // iterations are not required for correctness, or N - Step, otherwise. Step
4819 // is equal to the vectorization factor (number of SIMD elements) times the
4820 // unroll factor (number of SIMD instructions).
4821 VPValue *R =
4822 Builder.createNaryOp(Instruction::URem, {TC, Step},
4823 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4824
4825 // There are cases where we *must* run at least one iteration in the remainder
4826 // loop. See the cost model for when this can happen. If the step evenly
4827 // divides the trip count, we set the remainder to be equal to the step. If
4828 // the step does not evenly divide the trip count, no adjustment is necessary
4829 // since there will already be scalar iterations. Note that the minimum
4830 // iterations check ensures that N >= Step.
4831 if (RequiresScalarEpilogue) {
4832 assert(!TailByMasking &&
4833            "requiring scalar epilogue is not supported with tail folding");
4834 VPValue *IsZero =
4835 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4836 R = Builder.createSelect(IsZero, Step, R);
4837 }
4838
4839 VPValue *Res = Builder.createNaryOp(
4840 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4841 VectorTC.replaceAllUsesWith(Res);
4842}
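// [Editorial note: illustrative example, not part of the original source.]
// For N = 20, VF = 4, UF = 2 (Step = 8) and no tail folding:
//   n.mod.vf = 20 urem 8 = 4,  n.vec = 20 - 4 = 16.
// If a scalar epilogue is required and N = 16, the remainder would be 0, so it
// is bumped to Step and n.vec = 16 - 8 = 8, guaranteeing at least one scalar
// iteration. With tail folding, N is first rounded up via N + (Step - 1), so
// n.vec becomes the smallest multiple of Step that is >= N.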
4843
4845 ElementCount VFEC) {
4846 VPBuilder Builder(VectorPH, VectorPH->begin());
4847 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4848 VPValue &VF = Plan.getVF();
4849 VPValue &VFxUF = Plan.getVFxUF();
4850 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4851 // used.
4852 // TODO: Assert that they aren't used.
4853
4854 // If there are no users of the runtime VF, compute VFxUF by constant folding
4855 // the multiplication of VF and UF.
4856 if (VF.getNumUsers() == 0) {
4857 VPValue *RuntimeVFxUF =
4858 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4859 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4860 return;
4861 }
4862
4863 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4864 // vscale) * UF.
4865 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4867 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4869 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4870 }
4871 VF.replaceAllUsesWith(RuntimeVF);
4872
4873 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4874 VPValue *MulByUF = Builder.createOverflowingOp(
4875 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4876 VFxUF.replaceAllUsesWith(MulByUF);
4877}
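// [Editorial note: illustrative example, not part of the original source.]
// For VF = vscale x 4 and UF = 2: if only VFxUF has users, it is materialized
// directly as the element count 8 * vscale. If the runtime VF itself has
// users, VF is materialized as 4 * vscale (broadcast for any vector users)
// and VFxUF as (4 * vscale) * 2.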
4878
4881 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4882
4883 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4884 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4885 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4886 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4888 continue;
4889 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4890 if (!ExpSCEV)
4891 break;
4892 const SCEV *Expr = ExpSCEV->getSCEV();
4893 Value *Res =
4894 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4895 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4896 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4897 ExpSCEV->replaceAllUsesWith(Exp);
4898 if (Plan.getTripCount() == ExpSCEV)
4899 Plan.resetTripCount(Exp);
4900 ExpSCEV->eraseFromParent();
4901 }
4903 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4904 "before any VPIRInstructions");
4905 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4906 // to the VPIRBasicBlock.
4907 auto EI = Entry->begin();
4908 for (Instruction &I : drop_end(*EntryBB)) {
4909 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4910 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4911 EI++;
4912 continue;
4913 }
4915 }
4916
4917 return ExpandedSCEVs;
4918}
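// [Editorial note: illustrative example, not part of the original source.]
// A VPExpandSCEVRecipe for the SCEV (1 + %n) is expanded to IR right before
// the entry block's terminator, e.g.
//   %0 = add i64 %n, 1
// its users are rewired to the corresponding live-in (resetting the trip count
// if the expanded recipe was the trip count), and IR instructions in the entry
// block that are not yet modeled are then wrapped into the entry
// VPIRBasicBlock.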
4919
4920/// Returns true if \p OpV is a VPWidenLoadRecipe or VPInterleaveRecipe that
4921/// can be converted to a narrower recipe. \p OpV is the operand at index
4922/// \p OpIdx of a wide recipe feeding a store interleave group at index \p Idx,
4923/// and \p WideMember0 is the recipe feeding the same group at index 0. A
4924/// VPWidenLoadRecipe can be narrowed to an index-independent load if it feeds
4925/// all wide ops at all indices, i.e. \p OpV is also the operand at index
4926/// \p OpIdx of \p WideMember0. A VPInterleaveRecipe can be narrowed to a wide
4927/// load if \p OpV is defined at index \p Idx of a load interleave group.
4928static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4929 VPValue *OpV, unsigned Idx) {
4930 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4931 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4932 if (!Member0OpR)
4933 return Member0Op == OpV;
4934 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4935 return !W->getMask() && Member0Op == OpV;
4936 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4937 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4938 return false;
4939}
4940
4941/// Returns true if \p InterleaveR is a full interleave group with factor and
4942/// number of members both equal to \p VF. The interleave group must also
4943/// access the full vector width \p VectorRegWidth.
4944 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4945                                          ElementCount VF,
4946 VPTypeAnalysis &TypeInfo,
4947 TypeSize VectorRegWidth) {
4948 if (!InterleaveR || InterleaveR->getMask())
4949 return false;
4950
4951 Type *GroupElementTy = nullptr;
4952 if (InterleaveR->getStoredValues().empty()) {
4953 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4954 if (!all_of(InterleaveR->definedValues(),
4955 [&TypeInfo, GroupElementTy](VPValue *Op) {
4956 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4957 }))
4958 return false;
4959 } else {
4960 GroupElementTy =
4961 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4962 if (!all_of(InterleaveR->getStoredValues(),
4963 [&TypeInfo, GroupElementTy](VPValue *Op) {
4964 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4965 }))
4966 return false;
4967 }
4968
4969 unsigned VFMin = VF.getKnownMinValue();
4970 TypeSize GroupSize = TypeSize::get(
4971 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4972 const auto *IG = InterleaveR->getInterleaveGroup();
4973 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4974 GroupSize == VectorRegWidth;
4975}
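// [Editorial note: illustrative example, not part of the original source.]
// For VF = 4 with i32 members, an unmasked interleave group with factor 4 and
// 4 members spans 4 * 32 = 128 bits per original iteration; it qualifies here
// only if that equals the vector register width (e.g. 128 bits), i.e. the
// group covers exactly one full vector register.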
4976
4977/// Returns true if \p VPValue is a narrow VPValue.
4978static bool isAlreadyNarrow(VPValue *VPV) {
4979 if (isa<VPIRValue>(VPV))
4980 return true;
4981 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4982 return RepR && RepR->isSingleScalar();
4983}
4984
4985// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4986// a narrow variant.
4987 static VPValue *
4988 narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4989 auto *R = V->getDefiningRecipe();
4990 if (!R || NarrowedOps.contains(V))
4991 return V;
4992
4993 if (isAlreadyNarrow(V))
4994 return V;
4995
4996 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4997 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4998 WideMember0->setOperand(
4999 Idx,
5000 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
5001 return V;
5002 }
5003
5004 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
5005 // Narrow interleave group to wide load, as transformed VPlan will only
5006 // process one original iteration.
5007 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
5008 auto *L = new VPWidenLoadRecipe(
5009 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
5010 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
5011 L->insertBefore(LoadGroup);
5012 NarrowedOps.insert(L);
5013 return L;
5014 }
5015
5016 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
5017 assert(RepR->isSingleScalar() &&
5018 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
5019 "must be a single scalar load");
5020 NarrowedOps.insert(RepR);
5021 return RepR;
5022 }
5023
5024 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
5025 VPValue *PtrOp = WideLoad->getAddr();
5026 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
5027 PtrOp = VecPtr->getOperand(0);
5028 // Narrow wide load to uniform scalar load, as transformed VPlan will only
5029 // process one original iteration.
5030 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
5031 /*IsUniform*/ true,
5032 /*Mask*/ nullptr, {}, *WideLoad);
5033 N->insertBefore(WideLoad);
5034 NarrowedOps.insert(N);
5035 return N;
5036}
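// [Editorial note: illustrative summary, not part of the original source.]
// When narrowing the operand tree: a full load interleave group becomes a
// single consecutive VPWidenLoadRecipe, a plain VPWidenLoadRecipe becomes a
// single-scalar (uniform) load replicate recipe, wide recipes (e.g. widened
// binary ops) keep their opcode but have their operands narrowed recursively,
// and already-narrow values (live-ins and single-scalar replicates) are left
// untouched.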
5037
5039 TypeSize VectorRegWidth) {
5040 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
5041 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
5042 return;
5043
5044 VPTypeAnalysis TypeInfo(Plan);
5045
5047 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
5049 continue;
5050
5053 continue;
5054
5055 // Bail out on recipes not supported at the moment:
5056 // * phi recipes other than the canonical induction
5057 // * recipes writing to memory except interleave groups
5058 // Only support plans with a canonical induction phi.
5059 if (R.isPhi())
5060 return;
5061
5062 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
5063 if (R.mayWriteToMemory() && !InterleaveR)
5064 return;
5065
5066 // Do not narrow interleave groups if there are VectorPointer recipes and
5067 // the plan was unrolled. The recipe implicitly uses VF from
5068 // VPTransformState.
5069 // TODO: Remove restriction once the VF for the VectorPointer offset is
5070 // modeled explicitly as operand.
5071 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
5072 return;
5073
5074 // All other ops are allowed, but we reject uses that cannot be converted
5075 // when checking all allowed consumers (store interleave groups) below.
5076 if (!InterleaveR)
5077 continue;
5078
5079 // Bail out on non-consecutive interleave groups.
5080 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5081 VectorRegWidth))
5082 return;
5083
5084 // Skip read interleave groups.
5085 if (InterleaveR->getStoredValues().empty())
5086 continue;
5087
5088 // Narrow interleave groups, if all operands are already matching narrow
5089 // ops.
5090 auto *Member0 = InterleaveR->getStoredValues()[0];
5091 if (isAlreadyNarrow(Member0) &&
5092 all_of(InterleaveR->getStoredValues(), equal_to(Member0))) {
5093 StoreGroups.push_back(InterleaveR);
5094 continue;
5095 }
5096
5097 // For now, we only support full interleave groups storing load interleave
5098 // groups.
5099 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5100 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5101 if (!DefR)
5102 return false;
5103 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5104 return IR && IR->getInterleaveGroup()->isFull() &&
5105 IR->getVPValue(Op.index()) == Op.value();
5106 })) {
5107 StoreGroups.push_back(InterleaveR);
5108 continue;
5109 }
5110
5111     // Check if all values feeding InterleaveR are matching wide recipes whose
5112     // operands can be narrowed.
5113 auto *WideMember0 =
5114 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
5115 if (!WideMember0)
5116 return;
5117 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
5119 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5120 R->getNumOperands() > 2)
5121 return;
5122 if (any_of(enumerate(R->operands()),
5123 [WideMember0, Idx = I](const auto &P) {
5124 const auto &[OpIdx, OpV] = P;
5125 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5126 }))
5127 return;
5128 }
5129 StoreGroups.push_back(InterleaveR);
5130 }
5131
5132 if (StoreGroups.empty())
5133 return;
5134
5135   // Convert each store interleave group to a single VPWidenStoreRecipe.
5136 SmallPtrSet<VPValue *, 4> NarrowedOps;
5137 // Narrow operation tree rooted at store groups.
5138 for (auto *StoreGroup : StoreGroups) {
5139 VPValue *Res =
5140 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5141 auto *SI =
5142 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5143 auto *S = new VPWidenStoreRecipe(
5144 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5145 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5146 S->insertBefore(StoreGroup);
5147 StoreGroup->eraseFromParent();
5148 }
5149
5150 // Adjust induction to reflect that the transformed plan only processes one
5151 // original iteration.
5152 auto *CanIV = VectorLoop->getCanonicalIV();
5153 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5154 VPBuilder PHBuilder(Plan.getVectorPreheader());
5155
5156 VPValue *UF = Plan.getOrAddLiveIn(
5157 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5158 if (VF.isScalable()) {
5159 VPValue *VScale = PHBuilder.createElementCount(
5161 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5162 Instruction::Mul, {VScale, UF}, {true, false});
5163 Inc->setOperand(1, VScaleUF);
5164 Plan.getVF().replaceAllUsesWith(VScale);
5165 } else {
5166 Inc->setOperand(1, UF);
5168 Plan.getConstantInt(CanIV->getScalarType(), 1));
5169 }
5170 removeDeadRecipes(Plan);
5171}
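// [Editorial note: illustrative end-to-end example, not part of the original
// source.]
// For a loop processing 4 interleaved i32 fields per iteration, vectorized
// with VF = 4 on a 128-bit vector target, the factor-4 load and store
// interleave groups (each covering exactly one full vector per original
// iteration) are replaced by a single consecutive wide load and wide store of
// one iteration's 4 contiguous fields. The canonical IV increment is scaled
// down accordingly, to UF (or UF * vscale for scalable VFs), since each vector
// iteration of the narrowed plan now covers a single original iteration per
// unrolled part.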
5172
5173/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5174/// BranchOnCond recipe.
5176 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5177 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5178 auto *MiddleTerm =
5180 // Only add branch metadata if there is a (conditional) terminator.
5181 if (!MiddleTerm)
5182 return;
5183
5184 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5185 "must have a BranchOnCond");
5186   // Assume that `TripCount % VectorStep` is equally distributed.
5187 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5188 if (VF.isScalable() && VScaleForTuning.has_value())
5189 VectorStep *= *VScaleForTuning;
5190 assert(VectorStep > 0 && "trip count should not be zero");
5191 MDBuilder MDB(Plan.getContext());
5192 MDNode *BranchWeights =
5193 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5194 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5195}
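// [Editorial note: illustrative example, not part of the original source.]
// With VF = 4 and UF = 2 the vector step is 8, so the middle block's
// BranchOnCond gets branch weights {1, 7}: assuming trip counts are uniformly
// distributed modulo the step, roughly one in eight of them needs no scalar
// remainder iterations.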
5196
5197/// Compute and return the end value for \p WideIV, unless it is truncated. If
5198/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5199/// compute the end value of the induction.
5201 VPBuilder &VectorPHBuilder,
5202 VPTypeAnalysis &TypeInfo,
5203 VPValue *VectorTC) {
5204 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5205 // Truncated wide inductions resume from the last lane of their vector value
5206 // in the last vector iteration which is handled elsewhere.
5207 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5208 return nullptr;
5209
5210 VPIRValue *Start = WideIV->getStartValue();
5211 VPValue *Step = WideIV->getStepValue();
5213 VPValue *EndValue = VectorTC;
5214 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5215 EndValue = VectorPHBuilder.createDerivedIV(
5216 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5217 Start, VectorTC, Step);
5218 }
5219
5220 // EndValue is derived from the vector trip count (which has the same type as
5221 // the widest induction) and thus may be wider than the induction here.
5222 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5223 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5224 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5225 ScalarTypeOfWideIV,
5226 WideIV->getDebugLoc());
5227 }
5228
5229 return EndValue;
5230}
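// [Editorial note: illustrative example, not part of the original source.]
// For a wide induction with start S and step 2, the resume value after the
// main vector loop is computed in the vector preheader as a derived IV,
// conceptually S + 2 * n.vec, truncated if the induction is narrower than the
// vector trip count. For the canonical induction the vector trip count itself
// is used directly as the end value.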
5231
5233 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5234 VPTypeAnalysis TypeInfo(Plan);
5235 auto *ScalarPH = Plan.getScalarPreheader();
5236 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5237 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5238 VPBuilder VectorPHBuilder(
5239 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5240 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5241 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5242 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5243
5244 // TODO: Extract final value from induction recipe initially, optimize to
5245 // pre-computed end value together in optimizeInductionExitUsers.
5246 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5247 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5249 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5250 IVEndValues[WideIVR] = EndValue;
5251 ResumePhiR->setOperand(0, EndValue);
5252 ResumePhiR->setName("bc.resume.val");
5253 continue;
5254 }
5255 // TODO: Also handle truncated inductions here. Computing end-values
5256 // separately should be done as VPlan-to-VPlan optimization, after
5257 // legalizing all resume values to use the last lane from the loop.
5258 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5259 "should only skip truncated wide inductions");
5260 continue;
5261 }
5262
5263 // The backedge value provides the value to resume coming out of a loop,
5264 // which for FORs is a vector whose last element needs to be extracted. The
5265 // start value provides the value if the loop is bypassed.
5266 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5267 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5268 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5269 "Cannot handle loops with uncountable early exits");
5270 if (IsFOR) {
5271 auto *ExtractPart = MiddleBuilder.createNaryOp(
5272 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5273 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5275 "vector.recur.extract");
5276 }
5277 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5278 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5279 }
5280}
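// [Editorial note: illustrative summary, not part of the original source.]
// After this transform the scalar preheader phis are wired up as follows: for
// a reduction, the phi is renamed bc.merge.rdx and takes the reduction's
// backedge value from the middle block; for a first-order recurrence, the phi
// is renamed scalar.recur.init and takes the last element extracted from the
// last part of the recurrence vector; wide inductions instead resume from
// their pre-computed end value via bc.resume.val.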
5281
5283 VFRange &Range) {
5284 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5285 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5286 auto *MiddleVPBB = Plan.getMiddleBlock();
5287 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5288 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5289
5290 auto IsScalableOne = [](ElementCount VF) -> bool {
5291 return VF == ElementCount::getScalable(1);
5292 };
5293
5294 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5295 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5296 if (!FOR)
5297 continue;
5298
5299 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5300 "Cannot handle loops with uncountable early exits");
5301
5302 // This is the second phase of vectorizing first-order recurrences, creating
5303     // extracts for users outside the loop. An overview of the transformation is
5304 // described below. Suppose we have the following loop with some use after
5305 // the loop of the last a[i-1],
5306 //
5307 // for (int i = 0; i < n; ++i) {
5308 // t = a[i - 1];
5309 // b[i] = a[i] - t;
5310 // }
5311 // use t;
5312 //
5313 // There is a first-order recurrence on "a". For this loop, the shorthand
5314 // scalar IR looks like:
5315 //
5316 // scalar.ph:
5317 // s.init = a[-1]
5318 // br scalar.body
5319 //
5320 // scalar.body:
5321 // i = phi [0, scalar.ph], [i+1, scalar.body]
5322 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5323 // s2 = a[i]
5324 // b[i] = s2 - s1
5325 // br cond, scalar.body, exit.block
5326 //
5327 // exit.block:
5328 // use = lcssa.phi [s1, scalar.body]
5329 //
5330     // In this example, s1 is a recurrence because its value depends on the
5331 // previous iteration. In the first phase of vectorization, we created a
5332 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5333 // for users in the scalar preheader and exit block.
5334 //
5335 // vector.ph:
5336 // v_init = vector(..., ..., ..., a[-1])
5337 // br vector.body
5338 //
5339 // vector.body
5340 // i = phi [0, vector.ph], [i+4, vector.body]
5341 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5342 // v2 = a[i, i+1, i+2, i+3]
5343 // b[i] = v2 - v1
5344 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5345 // b[i, i+1, i+2, i+3] = v2 - v1
5346 // br cond, vector.body, middle.block
5347 //
5348 // middle.block:
5349 // vector.recur.extract.for.phi = v2(2)
5350 // vector.recur.extract = v2(3)
5351 // br cond, scalar.ph, exit.block
5352 //
5353 // scalar.ph:
5354 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5355 // [s.init, otherwise]
5356 // br scalar.body
5357 //
5358 // scalar.body:
5359 // i = phi [0, scalar.ph], [i+1, scalar.body]
5360 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5361 // s2 = a[i]
5362 // b[i] = s2 - s1
5363 // br cond, scalar.body, exit.block
5364 //
5365 // exit.block:
5366 // lo = lcssa.phi [s1, scalar.body],
5367 // [vector.recur.extract.for.phi, middle.block]
5368 //
5369 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5370 // Extract the penultimate value of the recurrence and use it as operand for
5371 // the VPIRInstruction modeling the phi.
5373 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5375 continue;
5376
5377 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5378 // penultimate value of the recurrence. Instead we rely on the existing
5379 // extract of the last element from the result of
5380 // VPInstruction::FirstOrderRecurrenceSplice.
5381 // TODO: Consider vscale_range info and UF.
5383 Range))
5384 return;
5385 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5386 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5387 "vector.recur.extract.for.phi");
5388 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5389 }
5390 }
5391}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPIRValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL)
After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use the EVL instea...
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPIRValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1521
APInt abs() const
Get the absolute value.
Definition APInt.h:1804
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1497
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1558
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:267
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3713
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4081
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4156
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4108
iterator end()
Definition VPlan.h:4118
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4116
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4169
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:230
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:591
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:563
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:637
const VPRecipeBase & back() const
Definition VPlan.h:4130
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2617
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2651
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2641
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2657
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2637
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:200
size_t getNumSuccessors() const
Definition VPlan.h:219
size_t getNumPredecessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:175
const std::string & getName() const
Definition VPlan.h:164
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:180
void setParent(VPRegionBlock *P)
Definition VPlan.h:184
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:243
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:264
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:176
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:195
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:213
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3122
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPIRValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3655
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:400
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:412
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:422
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3825
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3745
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3167
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2132
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2175
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2164
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4234
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4258
Class to record and manage LLVM IR flags.
Definition VPlan.h:665
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:1087
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1141
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1242
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1185
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1180
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1177
@ CanonicalIVIncrementForPart
Definition VPlan.h:1161
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2760
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2752
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2781
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2834
VPInterleaveRecipe is a recipe for transforming an interleave group of loads or stores into one wide l...
Definition VPlan.h:2792
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3309
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4386
VPBasicBlock * getParent()
Definition VPlan.h:462
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:536
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
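A sketch of recipe placement using the APIs above; NewR is a hypothetical unlinked recipe and Pos an existing recipe:
  NewR->insertBefore(Pos);                                   // link NewR immediately before Pos
  NewR->moveBefore(*Pos->getParent(), Pos->getIterator());   // reposition an already-linked recipe
  NewR->eraseFromParent();                                   // finally unlink it again and delete it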
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2996
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2885
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4269
const VPBlockBase * getEntry() const
Definition VPlan.h:4305
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4380
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4337
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4322
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4367
const VPBlockBase * getExiting() const
Definition VPlan.h:4317
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4330
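A sketch of common region queries; Plan is a hypothetical VPlan with a vector loop region:
  if (VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion()) {
    VPCanonicalIVPHIRecipe *CanIV = LoopRegion->getCanonicalIV();  // canonical induction of the region
    VPBasicBlock *PH = LoopRegion->getPreheaderVPBB();             // block feeding the region entry
    Type *IVTy = LoopRegion->getCanonicalIVType();                 // scalar type of the canonical IV
    (void)CanIV; (void)PH; (void)IVTy;
  }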
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3041
bool isSingleScalar() const
Definition VPlan.h:3082
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3106
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3897
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:588
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:651
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
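A sketch of type inference on VPValues; Plan and V are hypothetical, and a VPTypeAnalysis constructor taking the enclosing VPlan is assumed:
  VPTypeAnalysis TypeInfo(Plan);
  Type *ScalarTy = TypeInfo.inferScalarType(V);   // scalar (element) type of V
  LLVMContext &Ctx = TypeInfo.getContext();
  (void)ScalarTy; (void)Ctx;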
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:258
operand_range operands()
Definition VPlanValue.h:326
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:302
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:297
void addOperand(VPValue *Operand)
Definition VPlanValue.h:291
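A sketch of operand inspection and rewriting on a VPUser; U and NewOp are hypothetical:
  for (VPValue *Op : U->operands())
    (void)Op;                          // walk the inverse def-use edges
  if (U->getOperand(0) != NewOp)
    U->setOperand(0, NewOp);           // rewrite the first operand in place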
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:135
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1391
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:125
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:172
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1394
unsigned getNumUsers() const
Definition VPlanValue.h:104
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1398
user_range users()
Definition VPlanValue.h:125
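A sketch of use rewriting on a VPValue; Old and New are hypothetical values of compatible type:
  Old->replaceUsesWithIf(New, [](VPUser &U, unsigned Idx) {
    return isa<VPWidenRecipe>(&U);     // e.g. only rewrite widen-recipe users
  });
  // Old->replaceAllUsesWith(New);     // unconditional variant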
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1991
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3788
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1680
Instruction::CastOps getOpcode() const
Definition VPlan.h:1716
A recipe for handling GEP instructions.
Definition VPlan.h:1928
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2199
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2227
PHINode * getPHINode() const
Definition VPlan.h:2244
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2230
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2247
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2278
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2325
VPValue * getSplatVFValue() const
If the recipe has been unrolled, return the VPValue for the induction increment, otherwise return nul...
Definition VPlan.h:2329
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2356
A recipe for widening vector intrinsics.
Definition VPlan.h:1730
A common base class for widening memory operations.
Definition VPlan.h:3352
A recipe for widened phis.
Definition VPlan.h:2414
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1632
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4399
VPIRValue * getLiveIn(Value *V) const
Return the live-in VPIRValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4682
bool hasVF(ElementCount VF) const
Definition VPlan.h:4596
LLVMContext & getContext() const
Definition VPlan.h:4584
VPBasicBlock * getEntry()
Definition VPlan.h:4488
bool hasScalableVF() const
Definition VPlan.h:4597
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4582
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4578
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4546
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4567
VPIRValue * getFalse()
Return a VPIRValue wrapping i1 false.
Definition VPlan.h:4661
unsigned getUF() const
Definition VPlan.h:4616
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4730
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4685
bool hasUF(unsigned UF) const
Definition VPlan.h:4614
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4536
VPSymbolicValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4575
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4638
void setVF(ElementCount VF)
Definition VPlan.h:4590
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4629
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1031
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4560
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4513
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4708
VPIRValue * getTrue()
Return a VPIRValue wrapping i1 true.
Definition VPlan.h:4658
bool hasScalarVFOnly() const
Definition VPlan.h:4607
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4527
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4532
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4493
void setUF(unsigned UF)
Definition VPlan.h:4621
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4762
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4664
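A sketch of obtaining live-in values from a plan; Plan and SomeIRValue are hypothetical:
  VPIRValue *True = Plan.getTrue();                                        // i1 true live-in
  VPIRValue *C42 = Plan.getConstantInt(Type::getInt64Ty(Plan.getContext()), 42);
  VPIRValue *LiveIn = Plan.getOrAddLiveIn(SomeIRValue);                    // wrap (or reuse) an IR value
  (void)True; (void)C42; (void)LiveIn;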
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return A unsign-divided by B, rounded by the given rounding mode.
Definition APInt.cpp:2774
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::UDiv > m_UDiv(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ComputeReductionResult, Op0_t > m_ComputeReductionResult(const Op0_t &Op0)
VPInstruction_match< VPInstruction::StepVector > m_StepVector()
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
bind_ty< VPIRValue > m_VPIRValue(VPIRValue *&V)
Match a VPIRValue.
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, w/o matching its operands as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
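A sketch of declarative recipe matching with the VPlanPatternMatch helpers above; Cond and Def are hypothetical VPValues:
  bool IsLaneMask = match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue(), m_VPValue()));
  bool IsBroadcast = match(Def, m_Broadcast(m_VPValue()));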
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
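A sketch of the vputils queries that commonly guard transforms in this file; Def and Plan are hypothetical:
  bool ScalarSuffices =
      vputils::onlyFirstLaneUsed(Def) || vputils::isSingleScalar(Def);
  bool IsHeaderMask = vputils::isHeaderMask(Def, Plan);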
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
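A sketch combining the traversal helpers with early-increment iteration to erase recipes while walking the plan; Plan is hypothetical and mayHaveSideEffects() is an assumed VPRecipeBase query:
  for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry())) {
    auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
    if (!VPBB)
      continue;                                  // regions themselves hold no recipes
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      if (R.getNumDefinedValues() == 1 && !R.mayHaveSideEffects() &&
          R.getVPSingleValue()->getNumUsers() == 0)
        R.eraseFromParent();                     // early-inc iteration tolerates the erase
  }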
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1751
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:786
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:789
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2461
A VPValue representing a live-in from the input IR or a constant.
Definition VPlanValue.h:183
Type * getType() const
Returns the type of the underlying IR value.
Definition VPlan.cpp:139
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:223
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3485
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3443
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3570
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3526
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the original exit block for first-order recurrences.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by intro...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...
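A sketch of how the static entry points above are applied as standalone VPlan-to-VPlan passes; Plan is hypothetical and the ordering is only illustrative:
  VPlanTransforms::simplifyRecipes(Plan);
  VPlanTransforms::removeDeadRecipes(Plan);
  VPlanTransforms::removeBranchOnConst(Plan);
  VPlanTransforms::optimize(Plan);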