1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Intrinsics.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
35#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
54 case VPInstructionSC:
55 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
56 case VPInterleaveEVLSC:
57 case VPInterleaveSC:
58 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
59 case VPWidenStoreEVLSC:
60 case VPWidenStoreSC:
61 return true;
62 case VPReplicateSC:
63 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
64 ->mayWriteToMemory();
65 case VPWidenCallSC:
66 return !cast<VPWidenCallRecipe>(this)
67 ->getCalledScalarFunction()
68 ->onlyReadsMemory();
69 case VPWidenIntrinsicSC:
70 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
71 case VPCanonicalIVPHISC:
72 case VPBranchOnMaskSC:
73 case VPDerivedIVSC:
74 case VPFirstOrderRecurrencePHISC:
75 case VPReductionPHISC:
76 case VPScalarIVStepsSC:
77 case VPPredInstPHISC:
78 return false;
79 case VPBlendSC:
80 case VPReductionEVLSC:
81 case VPReductionSC:
82 case VPVectorPointerSC:
83 case VPWidenCanonicalIVSC:
84 case VPWidenCastSC:
85 case VPWidenGEPSC:
86 case VPWidenIntOrFpInductionSC:
87 case VPWidenLoadEVLSC:
88 case VPWidenLoadSC:
89 case VPWidenPHISC:
90 case VPWidenPointerInductionSC:
91 case VPWidenSC:
92 case VPWidenSelectSC: {
93 const Instruction *I =
94 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
95 (void)I;
96 assert((!I || !I->mayWriteToMemory()) &&
97 "underlying instruction may write to memory");
98 return false;
99 }
100 default:
101 return true;
102 }
103}
104
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
107 case VPExpressionSC:
108 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
109 case VPInstructionSC:
110 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
111 case VPWidenLoadEVLSC:
112 case VPWidenLoadSC:
113 return true;
114 case VPReplicateSC:
115 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
116 ->mayReadFromMemory();
117 case VPWidenCallSC:
118 return !cast<VPWidenCallRecipe>(this)
119 ->getCalledScalarFunction()
120 ->onlyWritesMemory();
121 case VPWidenIntrinsicSC:
122 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
123 case VPBranchOnMaskSC:
124 case VPDerivedIVSC:
125 case VPFirstOrderRecurrencePHISC:
126 case VPPredInstPHISC:
127 case VPScalarIVStepsSC:
128 case VPWidenStoreEVLSC:
129 case VPWidenStoreSC:
130 return false;
131 case VPBlendSC:
132 case VPReductionEVLSC:
133 case VPReductionSC:
134 case VPVectorPointerSC:
135 case VPWidenCanonicalIVSC:
136 case VPWidenCastSC:
137 case VPWidenGEPSC:
138 case VPWidenIntOrFpInductionSC:
139 case VPWidenPHISC:
140 case VPWidenPointerInductionSC:
141 case VPWidenSC:
142 case VPWidenSelectSC: {
143 const Instruction *I =
144 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
145 (void)I;
146 assert((!I || !I->mayReadFromMemory()) &&
147 "underlying instruction may read from memory");
148 return false;
149 }
150 default:
151 // FIXME: Return false if the recipe represents an interleaved store.
152 return true;
153 }
154}
155
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
158 case VPExpressionSC:
159 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
160 case VPDerivedIVSC:
161 case VPFirstOrderRecurrencePHISC:
162 case VPPredInstPHISC:
163 case VPVectorEndPointerSC:
164 return false;
165 case VPInstructionSC: {
166 auto *VPI = cast<VPInstruction>(this);
167 return mayWriteToMemory() ||
168 VPI->getOpcode() == VPInstruction::BranchOnCount ||
169 VPI->getOpcode() == VPInstruction::BranchOnCond;
170 }
171 case VPWidenCallSC: {
172 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
173 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
174 }
175 case VPWidenIntrinsicSC:
176 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
177 case VPBlendSC:
178 case VPReductionEVLSC:
179 case VPPartialReductionSC:
180 case VPReductionSC:
181 case VPScalarIVStepsSC:
182 case VPVectorPointerSC:
183 case VPWidenCanonicalIVSC:
184 case VPWidenCastSC:
185 case VPWidenGEPSC:
186 case VPWidenIntOrFpInductionSC:
187 case VPWidenPHISC:
188 case VPWidenPointerInductionSC:
189 case VPWidenSC:
190 case VPWidenSelectSC: {
191 const Instruction *I =
192 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
193 (void)I;
194 assert((!I || !I->mayHaveSideEffects()) &&
195 "underlying instruction has side-effects");
196 return false;
197 }
198 case VPInterleaveEVLSC:
199 case VPInterleaveSC:
200 return mayWriteToMemory();
201 case VPWidenLoadEVLSC:
202 case VPWidenLoadSC:
203 case VPWidenStoreEVLSC:
204 case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
210 return mayWriteToMemory();
211 case VPReplicateSC: {
212 auto *R = cast<VPReplicateRecipe>(this);
213 return R->getUnderlyingInstr()->mayHaveSideEffects();
214 }
215 default:
216 return true;
217 }
218}
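
// Usage note (a minimal sketch, not code from this file): VPlan-to-VPlan
// transforms are expected to consult the queries above before deleting or
// reordering recipes, e.g. a dead-recipe sweep along the lines of:
//   for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB)))
//     if (!R.mayHaveSideEffects() &&
//         none_of(R.definedValues(),
//                 [](VPValue *D) { return D->getNumUsers(); }))
//       R.eraseFromParent();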
219
void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
222 assert(InsertPos->getParent() &&
223 "Insertion position not in any VPBasicBlock");
224 InsertPos->getParent()->insert(this, InsertPos->getIterator());
225}
226
void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
230 assert(I == BB.end() || I->getParent() == &BB);
231 BB.insert(this, I);
232}
233
void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
236 assert(InsertPos->getParent() &&
237 "Insertion position not in any VPBasicBlock");
238 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
239}
240
void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}
246
iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}
251
void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}
256
262
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to:
  //  * decide if cost computation should be skipped for this recipe,
  //  * apply the forced target instruction cost.
268 Instruction *UI = nullptr;
269 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
270 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
271 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
272 UI = IG->getInsertPos();
273 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
274 UI = &WidenMem->getIngredient();
275
276 InstructionCost RecipeCost;
277 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
278 RecipeCost = 0;
279 } else {
280 RecipeCost = computeCost(VF, Ctx);
    if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid())
      RecipeCost = InstructionCost(ForceTargetInstructionCost);
  }
285
286 LLVM_DEBUG({
287 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
288 dump();
289 });
290 return RecipeCost;
291}
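
// Hedged usage note: ForceTargetInstructionCost above refers to the loop
// vectorizer's -force-target-instruction-cost command-line option; when set,
// every recipe with an underlying instruction reports that fixed cost instead
// of the TTI-derived one, e.g. (illustrative invocation):
//   opt -passes=loop-vectorize -force-target-instruction-cost=1 ...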
292
294 VPCostContext &Ctx) const {
295 llvm_unreachable("subclasses should implement computeCost");
296}
297
299 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
301}
302
304 auto *VPI = dyn_cast<VPInstruction>(this);
305 return VPI && Instruction::isCast(VPI->getOpcode());
306}
307
310 VPCostContext &Ctx) const {
311 std::optional<unsigned> Opcode;
312 VPValue *Op = getVecOp();
313 uint64_t MulConst;
314 // If the partial reduction is predicated, a select will be operand 1.
315 // If it isn't predicated and the mul isn't operating on a constant, then it
316 // should have been turned into a VPExpressionRecipe.
317 // FIXME: Replace the entire function with this once all partial reduction
318 // variants are bundled into VPExpressionRecipe.
320 !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
321 auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
322 auto *InputType = Ctx.Types.inferScalarType(getVecOp());
323 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
324 PhiType, VF, TTI::PR_None,
325 TTI::PR_None, {}, Ctx.CostKind);
326 }
327
328 VPRecipeBase *OpR = Op->getDefiningRecipe();
  Type *InputTypeA = nullptr, *InputTypeB = nullptr;
  TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
                                  ExtBType = TTI::PR_None;
332
333 auto GetExtendKind = [](VPRecipeBase *R) {
334 if (!R)
335 return TTI::PR_None;
336 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R);
337 if (!WidenCastR)
338 return TTI::PR_None;
339 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
340 return TTI::PR_ZeroExtend;
341 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
342 return TTI::PR_SignExtend;
343 return TTI::PR_None;
344 };
345
  // Pick out the opcode and type/extend information from a widen recipe,
  // looking through a sub (negation) to the widened operand it negates.
348 auto HandleWiden = [&](VPWidenRecipe *Widen) {
349 if (match(Widen, m_Sub(m_ZeroInt(), m_VPValue(Op)))) {
350 Widen = dyn_cast<VPWidenRecipe>(Op->getDefiningRecipe());
351 }
352 Opcode = Widen->getOpcode();
353 VPRecipeBase *ExtAR = Widen->getOperand(0)->getDefiningRecipe();
354 VPRecipeBase *ExtBR = Widen->getOperand(1)->getDefiningRecipe();
355 InputTypeA = Ctx.Types.inferScalarType(ExtAR ? ExtAR->getOperand(0)
356 : Widen->getOperand(0));
357 InputTypeB = Ctx.Types.inferScalarType(ExtBR ? ExtBR->getOperand(0)
358 : Widen->getOperand(1));
359 ExtAType = GetExtendKind(ExtAR);
360 ExtBType = GetExtendKind(ExtBR);
361
362 using namespace VPlanPatternMatch;
363 const APInt *C;
364 if (!ExtBR && match(Widen->getOperand(1), m_APInt(C)) &&
365 canConstantBeExtended(C, InputTypeA, ExtAType)) {
366 InputTypeB = InputTypeA;
367 ExtBType = ExtAType;
368 }
369 };
370
371 if (isa<VPWidenCastRecipe>(OpR)) {
372 InputTypeA = Ctx.Types.inferScalarType(OpR->getOperand(0));
373 ExtAType = GetExtendKind(OpR);
374 } else if (isa<VPReductionPHIRecipe>(OpR)) {
375 auto RedPhiOp1R = getOperand(1)->getDefiningRecipe();
376 if (isa<VPWidenCastRecipe>(RedPhiOp1R)) {
377 InputTypeA = Ctx.Types.inferScalarType(RedPhiOp1R->getOperand(0));
378 ExtAType = GetExtendKind(RedPhiOp1R);
379 } else if (auto Widen = dyn_cast<VPWidenRecipe>(RedPhiOp1R))
380 HandleWiden(Widen);
381 } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
382 HandleWiden(Widen);
383 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
384 return Reduction->computeCost(VF, Ctx);
385 }
386 auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
387 return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
388 PhiType, VF, ExtAType, ExtBType,
389 Opcode, Ctx.CostKind);
390}
391
393 auto &Builder = State.Builder;
394
395 assert(getOpcode() == Instruction::Add &&
396 "Unhandled partial reduction opcode");
397
398 Value *BinOpVal = State.get(getOperand(1));
399 Value *PhiVal = State.get(getOperand(0));
400 assert(PhiVal && BinOpVal && "Phi and Mul must be set");
401
402 Type *RetTy = PhiVal->getType();
403
404 CallInst *V =
405 Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
406 {PhiVal, BinOpVal}, nullptr, "partial.reduce");
407
408 State.set(this, V);
409}
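
// For reference, the call created above lowers to IR along these lines
// (sketch for VF=16 with a 4x scale factor; exact types depend on the plan):
//   %partial.reduce = call <4 x i32>
//       @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %phi,
//                                                    <16 x i32> %binop)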
410
411#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
413 VPSlotTracker &SlotTracker) const {
414 O << Indent << "PARTIAL-REDUCE ";
416 O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
418}
419#endif
420
422 assert(OpType == Other.OpType && "OpType must match");
423 switch (OpType) {
424 case OperationType::OverflowingBinOp:
425 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
426 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
427 break;
428 case OperationType::Trunc:
429 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
430 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
431 break;
432 case OperationType::DisjointOp:
433 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
434 break;
435 case OperationType::PossiblyExactOp:
436 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
437 break;
438 case OperationType::GEPOp:
439 GEPFlags &= Other.GEPFlags;
440 break;
441 case OperationType::FPMathOp:
442 FMFs.NoNaNs &= Other.FMFs.NoNaNs;
443 FMFs.NoInfs &= Other.FMFs.NoInfs;
444 break;
445 case OperationType::NonNegOp:
446 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
447 break;
448 case OperationType::Cmp:
449 assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
450 break;
451 case OperationType::Other:
452 assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
453 break;
454 }
455}
456
FastMathFlags VPIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
459 "recipe doesn't have fast math flags");
460 FastMathFlags Res;
461 Res.setAllowReassoc(FMFs.AllowReassoc);
462 Res.setNoNaNs(FMFs.NoNaNs);
463 Res.setNoInfs(FMFs.NoInfs);
464 Res.setNoSignedZeros(FMFs.NoSignedZeros);
465 Res.setAllowReciprocal(FMFs.AllowReciprocal);
466 Res.setAllowContract(FMFs.AllowContract);
467 Res.setApproxFunc(FMFs.ApproxFunc);
468 return Res;
469}
470
471#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
473#endif
474
475template <unsigned PartOpIdx>
476VPValue *
478 if (U.getNumOperands() == PartOpIdx + 1)
479 return U.getOperand(PartOpIdx);
480 return nullptr;
481}
482
483template <unsigned PartOpIdx>
485 if (auto *UnrollPartOp = getUnrollPartOperand(U))
486 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
487 return 0;
488}
489
490namespace llvm {
491template class VPUnrollPartAccessor<1>;
492template class VPUnrollPartAccessor<2>;
493template class VPUnrollPartAccessor<3>;
494}
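
// VPUnrollPartAccessor (sketch of intent): recipes that gain an extra
// "unroll part" operand during unrolling read it back via getUnrollPart();
// if the extra operand is absent the part defaults to 0, as implemented above.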
495
497 const VPIRFlags &Flags, const VPIRMetadata &MD,
498 DebugLoc DL, const Twine &Name)
499 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
500 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
502 "Set flags not supported for the provided opcode");
503 assert((getNumOperandsForOpcode(Opcode) == -1u ||
504 getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
505 "number of operands does not match opcode");
506}
507
508#ifndef NDEBUG
509unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
510 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
511 return 1;
512
513 if (Instruction::isBinaryOp(Opcode))
514 return 2;
515
516 switch (Opcode) {
519 return 0;
520 case Instruction::Alloca:
521 case Instruction::ExtractValue:
522 case Instruction::Freeze:
523 case Instruction::Load:
537 return 1;
538 case Instruction::ICmp:
539 case Instruction::FCmp:
540 case Instruction::Store:
548 return 2;
549 case Instruction::Select:
553 return 3;
555 return 4;
556 case Instruction::Call:
557 case Instruction::GetElementPtr:
558 case Instruction::PHI:
559 case Instruction::Switch:
560 // Cannot determine the number of operands from the opcode.
561 return -1u;
562 }
563 llvm_unreachable("all cases should be handled above");
564}
565#endif
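
// Example (hypothetical, for illustration only): constructing
//   new VPInstruction(Instruction::Add, {LHS, RHS}, Flags, DL)
// trips the operand-count assertion in the constructor above unless exactly
// two operands are supplied, since Instruction::isBinaryOp(Add) returns true.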
566
570
571bool VPInstruction::canGenerateScalarForFirstLane() const {
573 return true;
575 return true;
576 switch (Opcode) {
577 case Instruction::Freeze:
578 case Instruction::ICmp:
579 case Instruction::PHI:
580 case Instruction::Select:
589 return true;
590 default:
591 return false;
592 }
593}
594
595/// Create a conditional branch using \p Cond branching to the successors of \p
596/// VPBB. Note that the first successor is always forward (i.e. not created yet)
597/// while the second successor may already have been created (if it is a header
598/// block and VPBB is a latch).
600 VPTransformState &State) {
601 // Replace the temporary unreachable terminator with a new conditional
602 // branch, hooking it up to backward destination (header) for latch blocks
603 // now, and to forward destination(s) later when they are created.
604 // Second successor may be backwards - iff it is already in VPBB2IRBB.
605 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
606 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
607 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
608 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
609 // First successor is always forward, reset it to nullptr
610 CondBr->setSuccessor(0, nullptr);
612 return CondBr;
613}
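
// In other words (sketch): the branch is emitted with a placeholder first
// successor; the CFG-construction code that later creates the forward
// destination block is expected to patch successor 0 at that point.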
614
615Value *VPInstruction::generate(VPTransformState &State) {
616 IRBuilderBase &Builder = State.Builder;
617
  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
620 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
621 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
622 auto *Res =
623 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
624 if (auto *I = dyn_cast<Instruction>(Res))
625 applyFlags(*I);
626 return Res;
627 }
628
629 switch (getOpcode()) {
630 case VPInstruction::Not: {
631 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
632 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
633 return Builder.CreateNot(A, Name);
634 }
635 case Instruction::ExtractElement: {
636 assert(State.VF.isVector() && "Only extract elements from vectors");
637 if (getOperand(1)->isLiveIn()) {
638 unsigned IdxToExtract =
639 cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
640 return State.get(getOperand(0), VPLane(IdxToExtract));
641 }
642 Value *Vec = State.get(getOperand(0));
643 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
644 return Builder.CreateExtractElement(Vec, Idx, Name);
645 }
646 case Instruction::Freeze: {
648 return Builder.CreateFreeze(Op, Name);
649 }
650 case Instruction::FCmp:
651 case Instruction::ICmp: {
652 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
653 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
654 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
655 return Builder.CreateCmp(getPredicate(), A, B, Name);
656 }
657 case Instruction::PHI: {
658 llvm_unreachable("should be handled by VPPhi::execute");
659 }
660 case Instruction::Select: {
661 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
662 Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
663 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
664 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
665 return Builder.CreateSelect(Cond, Op1, Op2, Name);
666 }
668 // Get first lane of vector induction variable.
669 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
670 // Get the original loop tripcount.
671 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
672
673 // If this part of the active lane mask is scalar, generate the CMP directly
674 // to avoid unnecessary extracts.
675 if (State.VF.isScalar())
676 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
677 Name);
678
679 auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
680 auto PredTy = VectorType::get(
681 Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
682 ->getZExtValue());
683 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
684 {PredTy, ScalarTC->getType()},
685 {VIVElem0, ScalarTC}, nullptr, Name);
686 }
688 // Generate code to combine the previous and current values in vector v3.
689 //
690 // vector.ph:
691 // v_init = vector(..., ..., ..., a[-1])
692 // br vector.body
693 //
694 // vector.body
695 // i = phi [0, vector.ph], [i+4, vector.body]
696 // v1 = phi [v_init, vector.ph], [v2, vector.body]
697 // v2 = a[i, i+1, i+2, i+3];
698 // v3 = vector(v1(3), v2(0, 1, 2))
699
700 auto *V1 = State.get(getOperand(0));
701 if (!V1->getType()->isVectorTy())
702 return V1;
703 Value *V2 = State.get(getOperand(1));
704 return Builder.CreateVectorSplice(V1, V2, -1, Name);
705 }
707 unsigned UF = getParent()->getPlan()->getUF();
708 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
709 Value *Step = createStepForVF(Builder, ScalarTC->getType(), State.VF, UF);
710 Value *Sub = Builder.CreateSub(ScalarTC, Step);
711 Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
712 Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
713 return Builder.CreateSelect(Cmp, Sub, Zero);
714 }
716 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
717 // be outside of the main loop.
718 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
719 // Compute EVL
720 assert(AVL->getType()->isIntegerTy() &&
721 "Requested vector length should be an integer.");
722
723 assert(State.VF.isScalable() && "Expected scalable vector factor.");
724 Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
725
726 Value *EVL = State.Builder.CreateIntrinsic(
727 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
728 {AVL, VFArg, State.Builder.getTrue()});
729 return EVL;
730 }
732 unsigned Part = getUnrollPart(*this);
733 auto *IV = State.get(getOperand(0), VPLane(0));
734 assert(Part != 0 && "Must have a positive part");
735 // The canonical IV is incremented by the vectorization factor (num of
736 // SIMD elements) times the unroll part.
737 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
738 return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
740 }
742 Value *Cond = State.get(getOperand(0), VPLane(0));
743 auto *Br = createCondBranch(Cond, getParent(), State);
744 applyMetadata(*Br);
745 return Br;
746 }
748 // First create the compare.
749 Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
750 Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
751 Value *Cond = Builder.CreateICmpEQ(IV, TC);
752 return createCondBranch(Cond, getParent(), State);
753 }
755 return Builder.CreateVectorSplat(
756 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
757 }
759 // For struct types, we need to build a new 'wide' struct type, where each
760 // element is widened, i.e., we create a struct of vectors.
761 auto *StructTy =
763 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
764 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
765 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
766 FieldIndex++) {
767 Value *ScalarValue =
768 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
769 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
770 VectorValue =
771 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
772 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
773 }
774 }
775 return Res;
776 }
778 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
779 auto NumOfElements = ElementCount::getFixed(getNumOperands());
780 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
781 for (const auto &[Idx, Op] : enumerate(operands()))
782 Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
783 State.Builder.getInt32(Idx));
784 return Res;
785 }
787 if (State.VF.isScalar())
788 return State.get(getOperand(0), true);
789 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
791 // If this start vector is scaled then it should produce a vector with fewer
792 // elements than the VF.
793 ElementCount VF = State.VF.divideCoefficientBy(
794 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
795 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
796 Constant *Zero = Builder.getInt32(0);
797 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
798 Zero);
799 }
801 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
802 // and will be removed by breaking up the recipe further.
803 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
804 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
805 Value *ReducedPartRdx = State.get(getOperand(2));
806 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
807 ReducedPartRdx = Builder.CreateBinOp(
810 State.get(getOperand(Idx)), ReducedPartRdx, "bin.rdx");
811 return createAnyOfReduction(Builder, ReducedPartRdx,
812 State.get(getOperand(1), VPLane(0)), OrigPhi);
813 }
815 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
816 // and will be removed by breaking up the recipe further.
817 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
818 // Get its reduction variable descriptor.
819 RecurKind RK = PhiR->getRecurrenceKind();
821 "Unexpected reduction kind");
822 assert(!PhiR->isInLoop() &&
823 "In-loop FindLastIV reduction is not supported yet");
824
825 // The recipe's operands are the reduction phi, the start value, the
826 // sentinel value, followed by one operand for each part of the reduction.
827 unsigned UF = getNumOperands() - 3;
828 Value *ReducedPartRdx = State.get(getOperand(3));
829 RecurKind MinMaxKind;
832 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
833 else
834 MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
835 for (unsigned Part = 1; Part < UF; ++Part)
836 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
837 State.get(getOperand(3 + Part)));
838
839 Value *Start = State.get(getOperand(1), true);
841 return createFindLastIVReduction(Builder, ReducedPartRdx, RK, Start,
842 Sentinel);
843 }
845 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
846 // and will be removed by breaking up the recipe further.
847 auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
848 // Get its reduction variable descriptor.
849
850 RecurKind RK = PhiR->getRecurrenceKind();
852 "should be handled by ComputeFindIVResult");
853
854 // The recipe's operands are the reduction phi, followed by one operand for
855 // each part of the reduction.
856 unsigned UF = getNumOperands() - 1;
857 VectorParts RdxParts(UF);
858 for (unsigned Part = 0; Part < UF; ++Part)
859 RdxParts[Part] = State.get(getOperand(1 + Part), PhiR->isInLoop());
860
861 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
862 if (hasFastMathFlags())
864
865 // Reduce all of the unrolled parts into a single vector.
866 Value *ReducedPartRdx = RdxParts[0];
867 if (PhiR->isOrdered()) {
868 ReducedPartRdx = RdxParts[UF - 1];
869 } else {
870 // Floating-point operations should have some FMF to enable the reduction.
871 for (unsigned Part = 1; Part < UF; ++Part) {
872 Value *RdxPart = RdxParts[Part];
874 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
875 else {
          // For sub-recurrences, each unrolled part's reduction variable is
          // already negated, so we only need to add them:
          // reduce.add(-acc_uf0 + -acc_uf1).
879 if (RK == RecurKind::Sub)
880 Opcode = Instruction::Add;
881 else
882 Opcode =
884 ReducedPartRdx =
885 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
886 }
887 }
888 }
889
890 // Create the reduction after the loop. Note that inloop reductions create
891 // the target reduction in the loop using a Reduction recipe.
892 if (State.VF.isVector() && !PhiR->isInLoop()) {
893 // TODO: Support in-order reductions based on the recurrence descriptor.
894 // All ops in the reduction inherit fast-math-flags from the recurrence
895 // descriptor.
896 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
897 }
898
899 return ReducedPartRdx;
900 }
904 unsigned Offset =
906 Value *Res;
907 if (State.VF.isVector()) {
908 assert(Offset <= State.VF.getKnownMinValue() &&
909 "invalid offset to extract from");
910 // Extract lane VF - Offset from the operand.
911 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
912 } else {
913 assert(Offset <= 1 && "invalid offset to extract from");
914 Res = State.get(getOperand(0));
915 }
917 Res->setName(Name);
918 return Res;
919 }
921 Value *A = State.get(getOperand(0));
922 Value *B = State.get(getOperand(1));
923 return Builder.CreateLogicalAnd(A, B, Name);
924 }
927 "can only generate first lane for PtrAdd");
928 Value *Ptr = State.get(getOperand(0), VPLane(0));
929 Value *Addend = State.get(getOperand(1), VPLane(0));
930 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
931 }
935 Value *Addend = State.get(getOperand(1));
936 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
937 }
939 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
940 for (VPValue *Op : drop_begin(operands()))
941 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
942 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
943 }
945 Value *LaneToExtract = State.get(getOperand(0), true);
946 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
947 Value *Res = nullptr;
948 Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);
949
950 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
951 Value *VectorStart =
952 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
953 Value *VectorIdx = Idx == 1
954 ? LaneToExtract
955 : Builder.CreateSub(LaneToExtract, VectorStart);
956 Value *Ext = State.VF.isScalar()
957 ? State.get(getOperand(Idx))
958 : Builder.CreateExtractElement(
959 State.get(getOperand(Idx)), VectorIdx);
960 if (Res) {
961 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
962 Res = Builder.CreateSelect(Cmp, Ext, Res);
963 } else {
964 Res = Ext;
965 }
966 }
967 return Res;
968 }
970 if (getNumOperands() == 1) {
971 Value *Mask = State.get(getOperand(0));
972 return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
973 true, Name);
974 }
975 // If there are multiple operands, create a chain of selects to pick the
976 // first operand with an active lane and add the number of lanes of the
977 // preceding operands.
978 Value *RuntimeVF =
979 getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
980 unsigned LastOpIdx = getNumOperands() - 1;
981 Value *Res = nullptr;
982 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
983 Value *TrailingZeros =
984 State.VF.isScalar()
985 ? Builder.CreateZExt(
986 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
987 Builder.getFalse()),
988 Builder.getInt64Ty())
989 : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
990 State.get(getOperand(Idx)),
991 true, Name);
992 Value *Current = Builder.CreateAdd(
993 Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
994 if (Res) {
995 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
996 Res = Builder.CreateSelect(Cmp, Current, Res);
997 } else {
998 Res = Current;
999 }
1000 }
1001
1002 return Res;
1003 }
1005 return State.get(getOperand(0), true);
1006 default:
1007 llvm_unreachable("Unsupported opcode for instruction");
1008 }
1009}
1010
1012 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
1013 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1014 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1015 switch (Opcode) {
1016 case Instruction::FNeg:
1017 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1018 case Instruction::UDiv:
1019 case Instruction::SDiv:
1020 case Instruction::SRem:
1021 case Instruction::URem:
1022 case Instruction::Add:
1023 case Instruction::FAdd:
1024 case Instruction::Sub:
1025 case Instruction::FSub:
1026 case Instruction::Mul:
1027 case Instruction::FMul:
1028 case Instruction::FDiv:
1029 case Instruction::FRem:
1030 case Instruction::Shl:
1031 case Instruction::LShr:
1032 case Instruction::AShr:
1033 case Instruction::And:
1034 case Instruction::Or:
1035 case Instruction::Xor: {
1038
1039 if (VF.isVector()) {
1040 // Certain instructions can be cheaper to vectorize if they have a
1041 // constant second vector operand. One example of this are shifts on x86.
1042 VPValue *RHS = getOperand(1);
1043 RHSInfo = Ctx.getOperandInfo(RHS);
1044
1045 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1048 }
1049
1052 if (CtxI)
1053 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1054 return Ctx.TTI.getArithmeticInstrCost(
1055 Opcode, ResultTy, Ctx.CostKind,
1056 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1057 RHSInfo, Operands, CtxI, &Ctx.TLI);
1058 }
1059 case Instruction::Freeze:
1060 // This opcode is unknown. Assume that it is the same as 'mul'.
1061 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
1062 Ctx.CostKind);
1063 case Instruction::ExtractValue:
1064 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1065 Ctx.CostKind);
1066 case Instruction::ICmp:
1067 case Instruction::FCmp: {
1068 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
1069 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1071 return Ctx.TTI.getCmpSelInstrCost(
1072 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1073 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1074 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1075 }
1076 }
1077 llvm_unreachable("called for unsupported opcode");
1078}
1079
1081 VPCostContext &Ctx) const {
1083 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1084 // TODO: Compute cost for VPInstructions without underlying values once
1085 // the legacy cost model has been retired.
1086 return 0;
1087 }
1088
1090 "Should only generate a vector value or single scalar, not scalars "
1091 "for all lanes.");
1093 getOpcode(),
1095 }
1096
1097 switch (getOpcode()) {
1098 case Instruction::Select: {
1099 // TODO: It may be possible to improve this by analyzing where the
1100 // condition operand comes from.
1102 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1103 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
1104 if (!vputils::onlyFirstLaneUsed(this)) {
1105 CondTy = toVectorTy(CondTy, VF);
1106 VecTy = toVectorTy(VecTy, VF);
1107 }
1108 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1109 Ctx.CostKind);
1110 }
1111 case Instruction::ExtractElement:
1113 if (VF.isScalar()) {
1114 // ExtractLane with VF=1 takes care of handling extracting across multiple
1115 // parts.
1116 return 0;
1117 }
1118
1119 // Add on the cost of extracting the element.
1120 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1121 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1122 Ctx.CostKind);
1123 }
1124 case VPInstruction::AnyOf: {
1125 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1126 return Ctx.TTI.getArithmeticReductionCost(
1127 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1128 }
1130 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1131 if (VF.isScalar())
1132 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1134 CmpInst::ICMP_EQ, Ctx.CostKind);
1135 // Calculate the cost of determining the lane index.
1136 auto *PredTy = toVectorTy(ScalarTy, VF);
1137 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1138 Type::getInt64Ty(Ctx.LLVMCtx),
1139 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1140 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1141 }
1143 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1145 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
1146 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1147
1148 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
1149 cast<VectorType>(VectorTy),
1150 cast<VectorType>(VectorTy), Mask,
1151 Ctx.CostKind, VF.getKnownMinValue() - 1);
1152 }
1154 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
1155 unsigned Multiplier =
1156 cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
1157 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1158 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1159 {ArgTy, ArgTy});
1160 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1161 }
1163 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1164 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1165 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1166 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1167 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1168 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1169 }
1171 // Add on the cost of extracting the element.
1172 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1173 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1174 VecTy, Ctx.CostKind, 0);
1175 }
    if (VF == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();
    [[fallthrough]];
1180 default:
1181 // TODO: Compute cost other VPInstructions once the legacy cost model has
1182 // been retired.
1184 "unexpected VPInstruction witht underlying value");
1185 return 0;
1186 }
1187}
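
// Hedged note: returning 0 in the cases above mirrors the legacy cost model,
// which already accounts for these helper instructions; once it is retired,
// the TODOs above are expected to be replaced with real TTI queries.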
1188
1201
1203 switch (getOpcode()) {
1204 case Instruction::PHI:
1208 return true;
1209 default:
1210 return isScalarCast();
1211 }
1212}
1213
  assert(!State.Lane && "VPInstruction executing a Lane");
1216 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1218 "Set flags not supported for the provided opcode");
1219 if (hasFastMathFlags())
1220 State.Builder.setFastMathFlags(getFastMathFlags());
1221 Value *GeneratedValue = generate(State);
1222 if (!hasResult())
1223 return;
1224 assert(GeneratedValue && "generate must produce a value");
1225 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1228 assert((((GeneratedValue->getType()->isVectorTy() ||
1229 GeneratedValue->getType()->isStructTy()) ==
1230 !GeneratesPerFirstLaneOnly) ||
1231 State.VF.isScalar()) &&
1232 "scalar value but not only first lane defined");
1233 State.set(this, GeneratedValue,
1234 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1235}
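
// Sketch of the contract enforced above: a recipe that only defines lane 0
// stores a scalar in State, everything else stores a per-VF vector (or a
// struct of vectors); consumers pick which form they need via State.get().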
1236
1239 return false;
1240 switch (getOpcode()) {
1241 case Instruction::ExtractElement:
1242 case Instruction::Freeze:
1243 case Instruction::FCmp:
1244 case Instruction::ICmp:
1245 case Instruction::Select:
1246 case Instruction::PHI:
1263 case VPInstruction::Not:
1271 return false;
1272 default:
1273 return true;
1274 }
1275}
1276
1278 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1280 return vputils::onlyFirstLaneUsed(this);
1281
1282 switch (getOpcode()) {
1283 default:
1284 return false;
1285 case Instruction::ExtractElement:
1286 return Op == getOperand(1);
1287 case Instruction::PHI:
1288 return true;
1289 case Instruction::FCmp:
1290 case Instruction::ICmp:
1291 case Instruction::Select:
1292 case Instruction::Or:
1293 case Instruction::Freeze:
1294 case VPInstruction::Not:
1295 // TODO: Cover additional opcodes.
1296 return vputils::onlyFirstLaneUsed(this);
1305 return true;
1308 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1309 // operand, after replicating its operands only the first lane is used.
1310 // Before replicating, it will have only a single operand.
1311 return getNumOperands() > 1;
1313 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1315 // WidePtrAdd supports scalar and vector base addresses.
1316 return false;
1319 return Op == getOperand(1);
1321 return Op == getOperand(0);
1322 };
1323 llvm_unreachable("switch should return");
1324}
1325
1327 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1329 return vputils::onlyFirstPartUsed(this);
1330
1331 switch (getOpcode()) {
1332 default:
1333 return false;
1334 case Instruction::FCmp:
1335 case Instruction::ICmp:
1336 case Instruction::Select:
1337 return vputils::onlyFirstPartUsed(this);
1341 return true;
1342 };
1343 llvm_unreachable("switch should return");
1344}
1345
1346#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1348 VPSlotTracker SlotTracker(getParent()->getPlan());
1349 print(dbgs(), "", SlotTracker);
1350}
1351
1353 VPSlotTracker &SlotTracker) const {
1354 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1355
1356 if (hasResult()) {
1358 O << " = ";
1359 }
1360
1361 switch (getOpcode()) {
1362 case VPInstruction::Not:
1363 O << "not";
1364 break;
1366 O << "combined load";
1367 break;
1369 O << "combined store";
1370 break;
1372 O << "active lane mask";
1373 break;
1375 O << "EXPLICIT-VECTOR-LENGTH";
1376 break;
1378 O << "first-order splice";
1379 break;
1381 O << "branch-on-cond";
1382 break;
1384 O << "TC > VF ? TC - VF : 0";
1385 break;
1387 O << "VF * Part +";
1388 break;
1390 O << "branch-on-count";
1391 break;
1393 O << "broadcast";
1394 break;
1396 O << "buildstructvector";
1397 break;
1399 O << "buildvector";
1400 break;
1402 O << "extract-lane";
1403 break;
1405 O << "extract-last-element";
1406 break;
1408 O << "extract-last-lane-per-part";
1409 break;
1411 O << "extract-penultimate-element";
1412 break;
1414 O << "compute-anyof-result";
1415 break;
1417 O << "compute-find-iv-result";
1418 break;
1420 O << "compute-reduction-result";
1421 break;
1423 O << "logical-and";
1424 break;
1426 O << "ptradd";
1427 break;
1429 O << "wide-ptradd";
1430 break;
1432 O << "any-of";
1433 break;
1435 O << "first-active-lane";
1436 break;
1438 O << "reduction-start-vector";
1439 break;
1441 O << "resume-for-epilogue";
1442 break;
1444 O << "unpack";
1445 break;
1446 default:
1448 }
1449
1450 printFlags(O);
1452
1453 if (auto DL = getDebugLoc()) {
1454 O << ", !dbg ";
1455 DL.print(O);
1456 }
1457}
1458#endif
1459
1461 State.setDebugLocFrom(getDebugLoc());
1462 if (isScalarCast()) {
1463 Value *Op = State.get(getOperand(0), VPLane(0));
1464 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1465 Op, ResultTy);
1466 State.set(this, Cast, VPLane(0));
1467 return;
1468 }
1469 switch (getOpcode()) {
1471 Value *StepVector =
1472 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1473 State.set(this, StepVector);
1474 break;
1475 }
1476 case VPInstruction::VScale: {
1477 Value *VScale = State.Builder.CreateVScale(ResultTy);
1478 State.set(this, VScale, true);
1479 break;
1480 }
1481
1482 default:
1483 llvm_unreachable("opcode not implemented yet");
1484 }
1485}
1486
1487#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1489 VPSlotTracker &SlotTracker) const {
1490 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1492 O << " = ";
1493
1494 switch (getOpcode()) {
1496 O << "wide-iv-step ";
1498 break;
1500 O << "step-vector " << *ResultTy;
1501 break;
1503 O << "vscale " << *ResultTy;
1504 break;
1505 default:
1506 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1509 O << " to " << *ResultTy;
1510 }
1511}
1512#endif
1513
1515 State.setDebugLocFrom(getDebugLoc());
1516 PHINode *NewPhi = State.Builder.CreatePHI(
1517 State.TypeAnalysis.inferScalarType(this), 2, getName());
1518 unsigned NumIncoming = getNumIncoming();
1519 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1520 // TODO: Fixup all incoming values of header phis once recipes defining them
1521 // are introduced.
1522 NumIncoming = 1;
1523 }
1524 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1525 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1526 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1527 NewPhi->addIncoming(IncV, PredBB);
1528 }
1529 State.set(this, NewPhi, VPLane(0));
1530}
1531
1532#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1533void VPPhi::print(raw_ostream &O, const Twine &Indent,
1534 VPSlotTracker &SlotTracker) const {
1535 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1537 O << " = phi ";
1539}
1540#endif
1541
VPIRInstruction *VPIRInstruction::create(Instruction &I) {
1543 if (auto *Phi = dyn_cast<PHINode>(&I))
1544 return new VPIRPhi(*Phi);
1545 return new VPIRInstruction(I);
1546}
1547
1549 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1550 "PHINodes must be handled by VPIRPhi");
1551 // Advance the insert point after the wrapped IR instruction. This allows
1552 // interleaving VPIRInstructions and other recipes.
1553 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1554}
1555
1557 VPCostContext &Ctx) const {
1558 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1559 // hence it does not contribute to the cost-modeling for the VPlan.
1560 return 0;
1561}
1562
1565 "can only update exiting operands to phi nodes");
1566 assert(getNumOperands() > 0 && "must have at least one operand");
1567 VPValue *Exiting = getOperand(0);
1568 if (Exiting->isLiveIn())
1569 return;
1570
1571 Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
1572 setOperand(0, Exiting);
1573}
1574
1575#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1577 VPSlotTracker &SlotTracker) const {
1578 O << Indent << "IR " << I;
1579}
1580#endif
1581
1583 PHINode *Phi = &getIRPhi();
1584 for (const auto &[Idx, Op] : enumerate(operands())) {
1585 VPValue *ExitValue = Op;
1586 auto Lane = vputils::isSingleScalar(ExitValue)
1588 : VPLane::getLastLaneForVF(State.VF);
1589 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1590 auto *PredVPBB = Pred->getExitingBasicBlock();
1591 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1592 // Set insertion point in PredBB in case an extract needs to be generated.
1593 // TODO: Model extracts explicitly.
1594 State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
1595 Value *V = State.get(ExitValue, VPLane(Lane));
1596 // If there is no existing block for PredBB in the phi, add a new incoming
1597 // value. Otherwise update the existing incoming value for PredBB.
1598 if (Phi->getBasicBlockIndex(PredBB) == -1)
1599 Phi->addIncoming(V, PredBB);
1600 else
1601 Phi->setIncomingValueForBlock(PredBB, V);
1602 }
1603
1604 // Advance the insert point after the wrapped IR instruction. This allows
1605 // interleaving VPIRInstructions and other recipes.
1606 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1607}
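
// Illustrative effect (names hypothetical): for a wrapped resume/exit phi
//   %resume = phi i32 [ %init, %entry ], ...
// each VPlan predecessor contributes either a single-scalar value or the
// last-lane extract of its vector value, emitted in the corresponding
// predecessor block as set up above.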
1608
1610 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1611 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1612 "Number of phi operands must match number of predecessors");
1613 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1614 R->removeOperand(Position);
1615}
1616
1617#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1619 VPSlotTracker &SlotTracker) const {
1620 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1621 [this, &O, &SlotTracker](auto Op) {
1622 O << "[ ";
1623 Op.value()->printAsOperand(O, SlotTracker);
1624 O << ", ";
1625 getIncomingBlock(Op.index())->printAsOperand(O);
1626 O << " ]";
1627 });
1628}
1629#endif
1630
1631#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1632void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
1633 VPSlotTracker &SlotTracker) const {
1635
1636 if (getNumOperands() != 0) {
1637 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1639 [&O, &SlotTracker](auto Op) {
1640 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1641 O << " from ";
1642 std::get<1>(Op)->printAsOperand(O);
1643 });
1644 O << ")";
1645 }
1646}
1647#endif
1648
1650 : VPIRMetadata(I) {
1651 if (!LVer || !isa<LoadInst, StoreInst>(&I))
1652 return;
1653 const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I);
1654 if (AliasScopeMD)
1655 Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD);
1656 if (NoAliasMD)
1657 Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD);
1658}
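
// Hedged example: when LoopVersioning has created a runtime-checked no-alias
// version of the loop, a load tagged here would carry metadata such as
//   %l = load i32, ptr %p, !alias.scope !5, !noalias !7
// which is what the MD_alias_scope / MD_noalias kinds recorded above expand to.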
1659
1661 for (const auto &[Kind, Node] : Metadata)
1662 I.setMetadata(Kind, Node);
1663}
1664
1666 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1667 for (const auto &[KindA, MDA] : Metadata) {
1668 for (const auto &[KindB, MDB] : Other.Metadata) {
1669 if (KindA == KindB && MDA == MDB) {
1670 MetadataIntersection.emplace_back(KindA, MDA);
1671 break;
1672 }
1673 }
1674 }
1675 Metadata = std::move(MetadataIntersection);
1676}
1677
1679 assert(State.VF.isVector() && "not widening");
1680 assert(Variant != nullptr && "Can't create vector function.");
1681
1682 FunctionType *VFTy = Variant->getFunctionType();
1683 // Add return type if intrinsic is overloaded on it.
1685 for (const auto &I : enumerate(args())) {
1686 Value *Arg;
1687 // Some vectorized function variants may also take a scalar argument,
1688 // e.g. linear parameters for pointers. This needs to be the scalar value
1689 // from the start of the respective part when interleaving.
1690 if (!VFTy->getParamType(I.index())->isVectorTy())
1691 Arg = State.get(I.value(), VPLane(0));
1692 else
1693 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1694 Args.push_back(Arg);
1695 }
1696
1699 if (CI)
1700 CI->getOperandBundlesAsDefs(OpBundles);
1701
1702 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
1703 applyFlags(*V);
1704 applyMetadata(*V);
1705 V->setCallingConv(Variant->getCallingConv());
1706
1707 if (!V->getType()->isVoidTy())
1708 State.set(this, V);
1709}
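
// For illustration (names are hypothetical): with VF=4 a scalar call
//   %r = call float @foo(float %x)
// is rewritten to a call of the chosen vector variant, e.g.
//   %r.v = call <4 x float> @foo_vec4(<4 x float> %x.v)
// with scalar (e.g. linear pointer) parameters passed from lane 0 as above.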
1710
1712 VPCostContext &Ctx) const {
1713 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
1714 Variant->getFunctionType()->params(),
1715 Ctx.CostKind);
1716}
1717
1718#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1720 VPSlotTracker &SlotTracker) const {
1721 O << Indent << "WIDEN-CALL ";
1722
1723 Function *CalledFn = getCalledScalarFunction();
1724 if (CalledFn->getReturnType()->isVoidTy())
1725 O << "void ";
1726 else {
1728 O << " = ";
1729 }
1730
1731 O << "call";
1732 printFlags(O);
1733 O << " @" << CalledFn->getName() << "(";
1734 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1735 Op->printAsOperand(O, SlotTracker);
1736 });
1737 O << ")";
1738
1739 O << " (using library function";
1740 if (Variant->hasName())
1741 O << ": " << Variant->getName();
1742 O << ")";
1743}
1744#endif
1745
1747 assert(State.VF.isVector() && "not widening");
1748
1749 SmallVector<Type *, 2> TysForDecl;
1750 // Add return type if intrinsic is overloaded on it.
1751 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
1752 TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
1754 for (const auto &I : enumerate(operands())) {
1755 // Some intrinsics have a scalar argument - don't replace it with a
1756 // vector.
1757 Value *Arg;
1758 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
1759 State.TTI))
1760 Arg = State.get(I.value(), VPLane(0));
1761 else
1762 Arg = State.get(I.value(), onlyFirstLaneUsed(I.value()));
1763 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
1764 State.TTI))
1765 TysForDecl.push_back(Arg->getType());
1766 Args.push_back(Arg);
1767 }
1768
1769 // Use vector version of the intrinsic.
1770 Module *M = State.Builder.GetInsertBlock()->getModule();
1771 Function *VectorF =
1772 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
1773 assert(VectorF &&
1774 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
1775
1778 if (CI)
1779 CI->getOperandBundlesAsDefs(OpBundles);
1780
1781 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
1782
1783 applyFlags(*V);
1784 applyMetadata(*V);
1785
1786 if (!V->getType()->isVoidTy())
1787 State.set(this, V);
1788}
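
// Sketch (assuming VF=4 and an intrinsic overloaded on its result type):
//   %m = call i32 @llvm.smax.i32(i32 %a, i32 %b)
// becomes
//   %m.v = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a.v, <4 x i32> %b.v)
// while operands flagged scalar by isVectorIntrinsicWithScalarOpAtArg keep
// their lane-0 value.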
1789
1790/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1793 const VPRecipeWithIRFlags &R,
1794 ElementCount VF,
1795 VPCostContext &Ctx) {
1796 // Some backends analyze intrinsic arguments to determine cost. Use the
1797 // underlying value for the operand if it has one. Otherwise try to use the
1798 // operand of the underlying call instruction, if there is one. Otherwise
1799 // clear Arguments.
1800 // TODO: Rework TTI interface to be independent of concrete IR values.
1802 for (const auto &[Idx, Op] : enumerate(Operands)) {
1803 auto *V = Op->getUnderlyingValue();
1804 if (!V) {
1805 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
1806 Arguments.push_back(UI->getArgOperand(Idx));
1807 continue;
1808 }
1809 Arguments.clear();
1810 break;
1811 }
1812 Arguments.push_back(V);
1813 }
1814
1815 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
1816 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
1817 SmallVector<Type *> ParamTys;
1818 for (const VPValue *Op : Operands) {
1819 ParamTys.push_back(VF.isVector()
1820 ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
1821 : Ctx.Types.inferScalarType(Op));
1822 }
1823
1824 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1825 FastMathFlags FMF =
1826 R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
1827 IntrinsicCostAttributes CostAttrs(
1828 ID, RetTy, Arguments, ParamTys, FMF,
1829 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
1830 InstructionCost::getInvalid(), &Ctx.TLI);
1831 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
1832}
1833
1835 VPCostContext &Ctx) const {
1837 return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
1838}
1839
1841 return Intrinsic::getBaseName(VectorIntrinsicID);
1842}
1843
1845 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1846 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
1847 auto [Idx, V] = X;
1849 Idx, nullptr);
1850 });
1851}
1852
1853#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1855 VPSlotTracker &SlotTracker) const {
1856 O << Indent << "WIDEN-INTRINSIC ";
1857 if (ResultTy->isVoidTy()) {
1858 O << "void ";
1859 } else {
1861 O << " = ";
1862 }
1863
1864 O << "call";
1865 printFlags(O);
1866 O << getIntrinsicName() << "(";
1867
1869 Op->printAsOperand(O, SlotTracker);
1870 });
1871 O << ")";
1872}
1873#endif
1874
1876 IRBuilderBase &Builder = State.Builder;
1877
1878 Value *Address = State.get(getOperand(0));
1879 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
1880 VectorType *VTy = cast<VectorType>(Address->getType());
1881
1882 // The histogram intrinsic requires a mask even if the recipe doesn't;
1883 // if the mask operand was omitted then all lanes should be executed and
1884 // we just need to synthesize an all-true mask.
1885 Value *Mask = nullptr;
1886 if (VPValue *VPMask = getMask())
1887 Mask = State.get(VPMask);
1888 else
1889 Mask =
1890 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
1891
1892 // If this is a subtract, we want to invert the increment amount. We may
1893 // add a separate intrinsic in future, but for now we'll try this.
1894 if (Opcode == Instruction::Sub)
1895 IncAmt = Builder.CreateNeg(IncAmt);
1896 else
1897 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1898
1899 State.Builder.CreateIntrinsic(Intrinsic::experimental_vector_histogram_add,
1900 {VTy, IncAmt->getType()},
1901 {Address, IncAmt, Mask});
1902}
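
// Resulting IR, roughly (VF=4, i32 buckets; mask synthesized if absent):
//   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
//       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)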
1903
1905 VPCostContext &Ctx) const {
1906 // FIXME: Take the gather and scatter into account as well. For now we're
1907 // generating the same cost as the fallback path, but we'll likely
1908 // need to create a new TTI method for determining the cost, including
1909 // whether we can use base + vec-of-smaller-indices or just
1910 // vec-of-pointers.
1911 assert(VF.isVector() && "Invalid VF for histogram cost");
1912 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
1913 VPValue *IncAmt = getOperand(1);
1914 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
1915 VectorType *VTy = VectorType::get(IncTy, VF);
1916
1917 // Assume that a non-constant update value (or a constant != 1) requires
1918 // a multiply, and add that into the cost.
1919 InstructionCost MulCost =
1920 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
1921 if (IncAmt->isLiveIn()) {
    ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
1924 if (CI && CI->getZExtValue() == 1)
1925 MulCost = TTI::TCC_Free;
1926 }
1927
1928 // Find the cost of the histogram operation itself.
1929 Type *PtrTy = VectorType::get(AddressTy, VF);
1930 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1931 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1932 Type::getVoidTy(Ctx.LLVMCtx),
1933 {PtrTy, IncTy, MaskTy});
1934
1935 // Add the costs together with the add/sub operation.
1936 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
1937 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
1938}
1939
1940#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1942 VPSlotTracker &SlotTracker) const {
1943 O << Indent << "WIDEN-HISTOGRAM buckets: ";
1945
1946 if (Opcode == Instruction::Sub)
1947 O << ", dec: ";
1948 else {
1949 assert(Opcode == Instruction::Add);
1950 O << ", inc: ";
1951 }
1953
1954 if (VPValue *Mask = getMask()) {
1955 O << ", mask: ";
1956 Mask->printAsOperand(O, SlotTracker);
1957 }
1958}
1959
1961 VPSlotTracker &SlotTracker) const {
1962 O << Indent << "WIDEN-SELECT ";
1964 O << " = select ";
1965 printFlags(O);
1967 O << ", ";
1969 O << ", ";
1971 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1972}
1973#endif
1974
1976 // The condition can be loop invariant but still defined inside the
1977 // loop. This means that we can't just use the original 'cond' value.
1978 // We have to take the 'vectorized' value and pick the first lane.
1979 // Instcombine will make this a no-op.
1980 Value *Cond = State.get(getCond(), isInvariantCond());
1981
1982 Value *Op0 = State.get(getOperand(1));
1983 Value *Op1 = State.get(getOperand(2));
1984 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
1985 State.set(this, Sel);
1986 if (auto *I = dyn_cast<Instruction>(Sel)) {
1988 applyFlags(*I);
1989 applyMetadata(*I);
1990 }
1991}
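// Rough example of the two cases handled above (names assumed): for a
// loop-invariant condition, only the first lane of the vectorized condition is
// used as a scalar selector,
//   %c   = extractelement <4 x i1> %wide.cond, i64 0
//   %sel = select i1 %c, <4 x i32> %op0, <4 x i32> %op1
// while a loop-varying condition keeps the full <4 x i1> vector as the selector.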
1992
1994 VPCostContext &Ctx) const {
1996 bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1997 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1998 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1999
2000 VPValue *Op0, *Op1;
2001 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
2002 (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
2003 match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
2004 // select x, y, false --> x & y
2005 // select x, true, y --> x | y
2006 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
2007 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
2008
2010 if (all_of(operands(),
2011 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
2012 Operands.append(SI->op_begin(), SI->op_end());
2013 bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
2014 return Ctx.TTI.getArithmeticInstrCost(
2015 IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
2016 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
2017 }
2018
2019 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
2020 if (!ScalarCond)
2021 CondTy = VectorType::get(CondTy, VF);
2022
2024 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
2025 Pred = Cmp->getPredicate();
2026 return Ctx.TTI.getCmpSelInstrCost(
2027 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
2028 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
2029}
2030
2031VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2032 AllowReassoc = FMF.allowReassoc();
2033 NoNaNs = FMF.noNaNs();
2034 NoInfs = FMF.noInfs();
2035 NoSignedZeros = FMF.noSignedZeros();
2036 AllowReciprocal = FMF.allowReciprocal();
2037 AllowContract = FMF.allowContract();
2038 ApproxFunc = FMF.approxFunc();
2039}
2040
2041#if !defined(NDEBUG)
2042bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2043 switch (OpType) {
2044 case OperationType::OverflowingBinOp:
2045 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2046 Opcode == Instruction::Mul ||
2047 Opcode == VPInstruction::CanonicalIVIncrementForPart;
2048 case OperationType::Trunc:
2049 return Opcode == Instruction::Trunc;
2050 case OperationType::DisjointOp:
2051 return Opcode == Instruction::Or;
2052 case OperationType::PossiblyExactOp:
2053 return Opcode == Instruction::AShr;
2054 case OperationType::GEPOp:
2055 return Opcode == Instruction::GetElementPtr ||
2056 Opcode == VPInstruction::PtrAdd ||
2057 Opcode == VPInstruction::WidePtrAdd;
2058 case OperationType::FPMathOp:
2059 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
2060 Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
2061 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
2062 Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc ||
2063 Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
2064 Opcode == VPInstruction::WideIVStep ||
2067 case OperationType::NonNegOp:
2068 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2069 case OperationType::Cmp:
2070 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2071 case OperationType::Other:
2072 return true;
2073 }
2074 llvm_unreachable("Unknown OperationType enum");
2075}
2076#endif
2077
2078#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2080 switch (OpType) {
2081 case OperationType::Cmp:
2083 break;
2084 case OperationType::DisjointOp:
2085 if (DisjointFlags.IsDisjoint)
2086 O << " disjoint";
2087 break;
2088 case OperationType::PossiblyExactOp:
2089 if (ExactFlags.IsExact)
2090 O << " exact";
2091 break;
2092 case OperationType::OverflowingBinOp:
2093 if (WrapFlags.HasNUW)
2094 O << " nuw";
2095 if (WrapFlags.HasNSW)
2096 O << " nsw";
2097 break;
2098 case OperationType::Trunc:
2099 if (TruncFlags.HasNUW)
2100 O << " nuw";
2101 if (TruncFlags.HasNSW)
2102 O << " nsw";
2103 break;
2104 case OperationType::FPMathOp:
2106 break;
2107 case OperationType::GEPOp:
2108 if (GEPFlags.isInBounds())
2109 O << " inbounds";
2110 else if (GEPFlags.hasNoUnsignedSignedWrap())
2111 O << " nusw";
2112 if (GEPFlags.hasNoUnsignedWrap())
2113 O << " nuw";
2114 break;
2115 case OperationType::NonNegOp:
2116 if (NonNegFlags.NonNeg)
2117 O << " nneg";
2118 break;
2119 case OperationType::Other:
2120 break;
2121 }
2122 O << " ";
2123}
2124#endif
2125
2127 auto &Builder = State.Builder;
2128 switch (Opcode) {
2129 case Instruction::Call:
2130 case Instruction::Br:
2131 case Instruction::PHI:
2132 case Instruction::GetElementPtr:
2133 case Instruction::Select:
2134 llvm_unreachable("This instruction is handled by a different recipe.");
2135 case Instruction::UDiv:
2136 case Instruction::SDiv:
2137 case Instruction::SRem:
2138 case Instruction::URem:
2139 case Instruction::Add:
2140 case Instruction::FAdd:
2141 case Instruction::Sub:
2142 case Instruction::FSub:
2143 case Instruction::FNeg:
2144 case Instruction::Mul:
2145 case Instruction::FMul:
2146 case Instruction::FDiv:
2147 case Instruction::FRem:
2148 case Instruction::Shl:
2149 case Instruction::LShr:
2150 case Instruction::AShr:
2151 case Instruction::And:
2152 case Instruction::Or:
2153 case Instruction::Xor: {
2154 // Just widen unops and binops.
2156 for (VPValue *VPOp : operands())
2157 Ops.push_back(State.get(VPOp));
2158
2159 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2160
2161 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2162 applyFlags(*VecOp);
2163 applyMetadata(*VecOp);
2164 }
2165
2166 // Use this vector value for all users of the original instruction.
2167 State.set(this, V);
2168 break;
2169 }
2170 case Instruction::ExtractValue: {
2171 assert(getNumOperands() == 2 && "expected single level extractvalue");
2172 Value *Op = State.get(getOperand(0));
2174 Value *Extract = Builder.CreateExtractValue(Op, CI->getZExtValue());
2175 State.set(this, Extract);
2176 break;
2177 }
2178 case Instruction::Freeze: {
2179 Value *Op = State.get(getOperand(0));
2180 Value *Freeze = Builder.CreateFreeze(Op);
2181 State.set(this, Freeze);
2182 break;
2183 }
2184 case Instruction::ICmp:
2185 case Instruction::FCmp: {
2186 // Widen compares. Generate vector compares.
2187 bool FCmp = Opcode == Instruction::FCmp;
2188 Value *A = State.get(getOperand(0));
2189 Value *B = State.get(getOperand(1));
2190 Value *C = nullptr;
2191 if (FCmp) {
2192 // Propagate fast math flags.
2193 C = Builder.CreateFCmpFMF(
2194 getPredicate(), A, B,
2196 } else {
2197 C = Builder.CreateICmp(getPredicate(), A, B);
2198 }
2199 if (auto *I = dyn_cast<Instruction>(C))
2200 applyMetadata(*I);
2201 State.set(this, C);
2202 break;
2203 }
2204 default:
2205 // This instruction is not vectorized by simple widening.
2206 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2207 << Instruction::getOpcodeName(Opcode));
2208 llvm_unreachable("Unhandled instruction!");
2209 } // end of switch.
2210
2211#if !defined(NDEBUG)
2212 // Verify that VPlan type inference results agree with the type of the
2213 // generated values.
2214 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2215 State.get(this)->getType() &&
2216 "inferred type and type from generated instructions do not match");
2217#endif
2218}
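// Sketch of the common binop case (types assumed): a scalar 'add i32 %a, %b'
// in the original loop becomes a single wide operation on the vectorized
// operands fetched from State,
//   %wide = add <4 x i32> %wide.a, %wide.b
// with the recipe's wrapping flags and metadata re-applied to the result.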
2219
2221 VPCostContext &Ctx) const {
2222 switch (Opcode) {
2223 case Instruction::UDiv:
2224 case Instruction::SDiv:
2225 case Instruction::SRem:
2226 case Instruction::URem:
2227 // If the div/rem operation isn't safe to speculate and requires
2228 // predication, then the only way we can even create a vplan is to insert
2229 // a select on the second input operand to ensure we use the value of 1
2230 // for the inactive lanes. The select will be costed separately.
2231 case Instruction::FNeg:
2232 case Instruction::Add:
2233 case Instruction::FAdd:
2234 case Instruction::Sub:
2235 case Instruction::FSub:
2236 case Instruction::Mul:
2237 case Instruction::FMul:
2238 case Instruction::FDiv:
2239 case Instruction::FRem:
2240 case Instruction::Shl:
2241 case Instruction::LShr:
2242 case Instruction::AShr:
2243 case Instruction::And:
2244 case Instruction::Or:
2245 case Instruction::Xor:
2246 case Instruction::Freeze:
2247 case Instruction::ExtractValue:
2248 case Instruction::ICmp:
2249 case Instruction::FCmp:
2250 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2251 default:
2252 llvm_unreachable("Unsupported opcode for instruction");
2253 }
2254}
2255
2256#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2258 VPSlotTracker &SlotTracker) const {
2259 O << Indent << "WIDEN ";
2261 O << " = " << Instruction::getOpcodeName(Opcode);
2262 printFlags(O);
2264}
2265#endif
2266
2268 auto &Builder = State.Builder;
2269 /// Vectorize casts.
2270 assert(State.VF.isVector() && "Not vectorizing?");
2271 Type *DestTy = VectorType::get(getResultType(), State.VF);
2272 VPValue *Op = getOperand(0);
2273 Value *A = State.get(Op);
2274 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2275 State.set(this, Cast);
2276 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2277 applyFlags(*CastOp);
2278 applyMetadata(*CastOp);
2279 }
2280}
2281
2283 VPCostContext &Ctx) const {
2284 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2285 // the legacy cost model, including truncates/extends when evaluating a
2286 // reduction in a smaller type.
2287 if (!getUnderlyingValue())
2288 return 0;
2289 // Computes the CastContextHint from a recipe that may access memory.
2290 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2291 if (VF.isScalar())
2293 if (isa<VPInterleaveBase>(R))
2295 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
2296 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2298 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
2299 if (WidenMemoryRecipe == nullptr)
2301 if (!WidenMemoryRecipe->isConsecutive())
2303 if (WidenMemoryRecipe->isReverse())
2305 if (WidenMemoryRecipe->isMasked())
2308 };
2309
2310 VPValue *Operand = getOperand(0);
2312 // For Trunc/FPTrunc, get the context from the only user.
2313 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2315 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
2316 CCH = ComputeCCH(StoreRecipe);
2317 }
2318 // For Z/Sext, get the context from the operand.
2319 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2320 Opcode == Instruction::FPExt) {
2321 if (Operand->isLiveIn())
2323 else if (Operand->getDefiningRecipe())
2324 CCH = ComputeCCH(Operand->getDefiningRecipe());
2325 }
2326
2327 auto *SrcTy =
2328 cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(Operand), VF));
2329 auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
2330 // Arm TTI will use the underlying instruction to determine the cost.
2331 return Ctx.TTI.getCastInstrCost(
2332 Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
2334}
2335
2336#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2338 VPSlotTracker &SlotTracker) const {
2339 O << Indent << "WIDEN-CAST ";
2341 O << " = " << Instruction::getOpcodeName(Opcode);
2342 printFlags(O);
2344 O << " to " << *getResultType();
2345}
2346#endif
2347
2349 VPCostContext &Ctx) const {
2350 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2351}
2352
2353/// A helper function that returns an integer or floating-point constant with
2354/// value C.
2356 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
2357 : ConstantFP::get(Ty, C);
2358}
2359
2360#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2362 VPSlotTracker &SlotTracker) const {
2363 O << Indent;
2365 O << " = WIDEN-INDUCTION ";
2367
2368 if (auto *TI = getTruncInst())
2369 O << " (truncated to " << *TI->getType() << ")";
2370}
2371#endif
2372
2374 // The step may be defined by a recipe in the preheader (e.g. if it requires
2375 // SCEV expansion), but for the canonical induction the step is required to be
2376 // 1, which is represented as live-in.
2378 return false;
2381 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2382 getScalarType() == getRegion()->getCanonicalIVType();
2383}
2384
2385#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2387 VPSlotTracker &SlotTracker) const {
2388 O << Indent;
2390 O << " = DERIVED-IV ";
2391 getStartValue()->printAsOperand(O, SlotTracker);
2392 O << " + ";
2393 getOperand(1)->printAsOperand(O, SlotTracker);
2394 O << " * ";
2395 getStepValue()->printAsOperand(O, SlotTracker);
2396}
2397#endif
2398
2400 // Fast-math-flags propagate from the original induction instruction.
2401 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2402 if (hasFastMathFlags())
2403 State.Builder.setFastMathFlags(getFastMathFlags());
2404
2405 // Compute the scalar induction steps, based on the scalar induction
2406 // variable (operand 0) and the step operand.
2407
2408 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2409 Value *Step = State.get(getStepValue(), VPLane(0));
2410 IRBuilderBase &Builder = State.Builder;
2411
2412 // Ensure the step has the same type as the scalar IV.
2413 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2414 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2415
2416 // We build scalar steps for both integer and floating-point induction
2417 // variables. Here, we determine the kind of arithmetic we will perform.
2420 if (BaseIVTy->isIntegerTy()) {
2421 AddOp = Instruction::Add;
2422 MulOp = Instruction::Mul;
2423 } else {
2424 AddOp = InductionOpcode;
2425 MulOp = Instruction::FMul;
2426 }
2427
2428 // Determine the number of scalars we need to generate for each unroll
2429 // iteration.
2430 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2431 // Compute the scalar steps and save the results in State.
2432 Type *IntStepTy =
2433 IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
2434 Type *VecIVTy = nullptr;
2435 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2436 if (!FirstLaneOnly && State.VF.isScalable()) {
2437 VecIVTy = VectorType::get(BaseIVTy, State.VF);
2438 UnitStepVec =
2439 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2440 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2441 SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
2442 }
2443
2444 unsigned StartLane = 0;
2445 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2446 if (State.Lane) {
2447 StartLane = State.Lane->getKnownLane();
2448 EndLane = StartLane + 1;
2449 }
2450 Value *StartIdx0;
2451 if (getUnrollPart(*this) == 0)
2452 StartIdx0 = ConstantInt::get(IntStepTy, 0);
2453 else {
2454 StartIdx0 = State.get(getOperand(2), true);
2455 if (getUnrollPart(*this) != 1) {
2456 StartIdx0 =
2457 Builder.CreateMul(StartIdx0, ConstantInt::get(StartIdx0->getType(),
2458 getUnrollPart(*this)));
2459 }
2460 StartIdx0 = Builder.CreateSExtOrTrunc(StartIdx0, IntStepTy);
2461 }
2462
2463 if (!FirstLaneOnly && State.VF.isScalable()) {
2464 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2465 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2466 if (BaseIVTy->isFloatingPointTy())
2467 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2468 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2469 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2470 State.set(this, Add);
2471 // It's also useful to record the per-lane values for the known minimum
2472 // number of elements, so we emit those below as well. This improves the code
2473 // quality when extracting the first element, for example.
2474 }
2475
2476 if (BaseIVTy->isFloatingPointTy())
2477 StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
2478
2479 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2480 Value *StartIdx = Builder.CreateBinOp(
2481 AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
2482 // The step returned by `createStepForVF` is a runtime-evaluated value
2483 // when VF is scalable. Otherwise, it should be folded into a Constant.
2484 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2485 "Expected StartIdx to be folded to a constant when VF is not "
2486 "scalable");
2487 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2488 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2489 State.set(this, Add, VPLane(Lane));
2490 }
2491}
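// Illustrative example (assumed integer IV, VF = 4, unroll part 0): with base
// IV %iv and step %s, the per-lane scalar values produced by the loop above are
//   lane 0: %iv + 0 * %s
//   lane 1: %iv + 1 * %s
//   lane 2: %iv + 2 * %s
//   lane 3: %iv + 3 * %s
// For scalable VF the same values are additionally materialized as a vector
// using a stepvector, a splat of %s and a splat of %iv, as computed earlier.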
2492
2493#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2495 VPSlotTracker &SlotTracker) const {
2496 O << Indent;
2498 O << " = SCALAR-STEPS ";
2500}
2501#endif
2502
2504 assert(State.VF.isVector() && "not widening");
2505 // Construct a vector GEP by widening the operands of the scalar GEP as
2506 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2507 // results in a vector of pointers when at least one operand of the GEP
2508 // is vector-typed. Thus, to keep the representation compact, we only use
2509 // vector-typed operands for loop-varying values.
2510
2511 if (areAllOperandsInvariant()) {
2512 // If we are vectorizing, but the GEP has only loop-invariant operands,
2513 // the GEP we build (by only using vector-typed operands for
2514 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2515 // produce a vector of pointers, we need to either arbitrarily pick an
2516 // operand to broadcast, or broadcast a clone of the original GEP.
2517 // Here, we broadcast a clone of the original.
2518 //
2519 // TODO: If at some point we decide to scalarize instructions having
2520 // loop-invariant operands, this special case will no longer be
2521 // required. We would add the scalarization decision to
2522 // collectLoopScalars() and teach getVectorValue() to broadcast
2523 // the lane-zero scalar value.
2525 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2526 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2527
2528 auto *NewGEP =
2529 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2530 "", getGEPNoWrapFlags());
2531 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2532 State.set(this, Splat);
2533 } else {
2534 // If the GEP has at least one loop-varying operand, we are sure to
2535 // produce a vector of pointers unless VF is scalar.
2536 // The pointer operand of the new GEP. If it's loop-invariant, we
2537 // won't broadcast it.
2538 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2539
2540 // Collect all the indices for the new GEP. If any index is
2541 // loop-invariant, we won't broadcast it.
2543 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2544 VPValue *Operand = getOperand(I);
2545 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2546 }
2547
2548 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2549 // but it should be a vector, otherwise.
2550 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2551 "", getGEPNoWrapFlags());
2552 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2553 "NewGEP is not a pointer vector");
2554 State.set(this, NewGEP);
2555 }
2556}
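// For example (element type assumed): when every operand is loop-invariant,
// the code above emits a scalar GEP followed by a splat of the resulting
// pointer, whereas a single loop-varying index makes the GEP itself produce a
// vector of pointers, e.g.
//   %vec.gep = getelementptr i32, ptr %base, <4 x i64> %wide.idx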
2557
2558#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2560 VPSlotTracker &SlotTracker) const {
2561 O << Indent << "WIDEN-GEP ";
2562 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2563 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2564 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2565
2566 O << " ";
2568 O << " = getelementptr";
2569 printFlags(O);
2571}
2572#endif
2573
2574static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2575 unsigned CurrentPart, IRBuilderBase &Builder) {
2576 // Use i32 for the gep index type when the value is constant,
2577 // or query DataLayout for a more suitable index type otherwise.
2578 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2579 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2580 ? DL.getIndexType(Builder.getPtrTy(0))
2581 : Builder.getInt32Ty();
2582}
2583
2585 auto &Builder = State.Builder;
2586 unsigned CurrentPart = getUnrollPart(*this);
2587 bool IsUnitStride = Stride == 1 || Stride == -1;
2588 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2589 IsUnitStride, CurrentPart, Builder);
2590
2591 // The wide store needs to start at the last vector element.
2592 Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
2593 if (IndexTy != RunTimeVF->getType())
2594 RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2595 // NumElt = Stride * CurrentPart * RunTimeVF
2596 Value *NumElt = Builder.CreateMul(
2597 ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2598 // LastLane = Stride * (RunTimeVF - 1)
2599 Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2600 if (Stride != 1)
2601 LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
2602 Value *Ptr = State.get(getOperand(0), VPLane(0));
2603 Value *ResultPtr =
2604 Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
2605 ResultPtr = Builder.CreateGEP(IndexedTy, ResultPtr, LastLane, "",
2607
2608 State.set(this, ResultPtr, /*IsScalar*/ true);
2609}
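// Sketch for a reverse unit-stride access (Stride = -1, fixed VF = 4,
// part 0): NumElt = -1 * 0 * 4 = 0 and LastLane = -1 * (4 - 1) = -3, so the
// emitted GEPs are
//   %p0 = getelementptr i32, ptr %ptr, i32 0
//   %p  = getelementptr i32, ptr %p0, i32 -3
// i.e. %p points at the lowest address covered by the reversed wide access.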
2610
2611#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2613 VPSlotTracker &SlotTracker) const {
2614 O << Indent;
2616 O << " = vector-end-pointer";
2617 printFlags(O);
2619}
2620#endif
2621
2623 auto &Builder = State.Builder;
2624 unsigned CurrentPart = getUnrollPart(*this);
2625 Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2626 /*IsUnitStride*/ true, CurrentPart, Builder);
2627 Value *Ptr = State.get(getOperand(0), VPLane(0));
2628
2629 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
2630 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
2631 "", getGEPNoWrapFlags());
2632
2633 State.set(this, ResultPtr, /*IsScalar*/ true);
2634}
2635
2636#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2638 VPSlotTracker &SlotTracker) const {
2639 O << Indent;
2641 O << " = vector-pointer ";
2642
2644}
2645#endif
2646
2648 VPCostContext &Ctx) const {
2649 // Handle cases where only the first lane is used the same way as the legacy
2650 // cost model.
2652 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2653
2654 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
2655 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
2656 return (getNumIncomingValues() - 1) *
2657 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
2658 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
2659}
2660
2661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2663 VPSlotTracker &SlotTracker) const {
2664 O << Indent << "BLEND ";
2666 O << " =";
2667 if (getNumIncomingValues() == 1) {
2668 // Not a User of any mask: not really blending, this is a
2669 // single-predecessor phi.
2670 O << " ";
2671 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2672 } else {
2673 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2674 O << " ";
2675 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2676 if (I == 0)
2677 continue;
2678 O << "/";
2679 getMask(I)->printAsOperand(O, SlotTracker);
2680 }
2681 }
2682}
2683#endif
2684
2686 assert(!State.Lane && "Reduction being replicated.");
2687 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2690 "In-loop AnyOf reductions aren't currently supported");
2691 // Propagate the fast-math flags carried by the underlying instruction.
2692 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2693 State.Builder.setFastMathFlags(getFastMathFlags());
2694 Value *NewVecOp = State.get(getVecOp());
2695 if (VPValue *Cond = getCondOp()) {
2696 Value *NewCond = State.get(Cond, State.VF.isScalar());
2697 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
2698 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2699
2700 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
2701 if (State.VF.isVector())
2702 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
2703
2704 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
2705 NewVecOp = Select;
2706 }
2707 Value *NewRed;
2708 Value *NextInChain;
2709 if (IsOrdered) {
2710 if (State.VF.isVector())
2711 NewRed =
2712 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
2713 else
2714 NewRed = State.Builder.CreateBinOp(
2716 PrevInChain, NewVecOp);
2717 PrevInChain = NewRed;
2718 NextInChain = NewRed;
2719 } else {
2720 PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
2721 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
2723 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
2724 else
2725 NextInChain = State.Builder.CreateBinOp(
2727 PrevInChain, NewRed);
2728 }
2729 State.set(this, NextInChain, /*IsScalar*/ true);
2730}
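// Sketch of a conditional, unordered integer add reduction with VF = 4 (names
// assumed): inactive lanes are replaced by the identity before reducing, and
// the result is folded into the chain value,
//   %sel   = select <4 x i1> %cond, <4 x i32> %vec.op, <4 x i32> zeroinitializer
//   %red   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sel)
//   %chain = add i32 %prev, %red
// Ordered FP reductions instead feed %prev directly into an ordered reduction.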
2731
2733 assert(!State.Lane && "Reduction being replicated.");
2734
2735 auto &Builder = State.Builder;
2736 // Propagate the fast-math flags carried by the underlying instruction.
2737 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2738 Builder.setFastMathFlags(getFastMathFlags());
2739
2741 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
2742 Value *VecOp = State.get(getVecOp());
2743 Value *EVL = State.get(getEVL(), VPLane(0));
2744
2745 Value *Mask;
2746 if (VPValue *CondOp = getCondOp())
2747 Mask = State.get(CondOp);
2748 else
2749 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
2750
2751 Value *NewRed;
2752 if (isOrdered()) {
2753 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
2754 } else {
2755 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
2757 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
2758 else
2759 NewRed = Builder.CreateBinOp(
2761 Prev);
2762 }
2763 State.set(this, NewRed, /*IsScalar*/ true);
2764}
2765
2767 VPCostContext &Ctx) const {
2768 RecurKind RdxKind = getRecurrenceKind();
2769 Type *ElementTy = Ctx.Types.inferScalarType(this);
2770 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
2771 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
2773 std::optional<FastMathFlags> OptionalFMF =
2774 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
2775
2776 // TODO: Support any-of reductions.
2777 assert(
2779 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2780 "Any-of reduction not implemented in VPlan-based cost model currently.");
2781
2782 // Note that TTI should model the cost of moving the result to a scalar
2783 // register and the BinOp cost in getMinMaxReductionCost().
2786 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
2787 }
2788
2789 // Note that TTI should model the cost of moving the result to a scalar
2790 // register and the BinOp cost in getArithmeticReductionCost().
2791 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
2792 Ctx.CostKind);
2793}
2794
2796 ExpressionTypes ExpressionType,
2797 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2798 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2799 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2800 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2801 assert(
2802 none_of(ExpressionRecipes,
2803 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2804 "expression cannot contain recipes with side-effects");
2805
2806 // Maintain a copy of the expression recipes as a set of users.
2807 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2808 for (auto *R : ExpressionRecipes)
2809 ExpressionRecipesAsSetOfUsers.insert(R);
2810
2811 // Recipes in the expression, except the last one, must only be used by
2812 // (other) recipes inside the expression. If there are other users, external
2813 // to the expression, use a clone of the recipe for external users.
2814 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
2815 if (R != ExpressionRecipes.back() &&
2816 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2817 return !ExpressionRecipesAsSetOfUsers.contains(U);
2818 })) {
2819 // There are users outside of the expression. Clone the recipe and use the
2820 // clone for those external users.
2821 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2822 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
2823 VPUser &U, unsigned) {
2824 return !ExpressionRecipesAsSetOfUsers.contains(&U);
2825 });
2826 CopyForExtUsers->insertBefore(R);
2827 }
2828 if (R->getParent())
2829 R->removeFromParent();
2830 }
2831
2832 // Internalize all external operands to the expression recipes. To do so,
2833 // create new temporary VPValues for all operands defined by a recipe outside
2834 // the expression. The original operands are added as operands of the
2835 // VPExpressionRecipe itself.
2836 for (auto *R : ExpressionRecipes) {
2837 for (const auto &[Idx, Op] : enumerate(R->operands())) {
2838 auto *Def = Op->getDefiningRecipe();
2839 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
2840 continue;
2841 addOperand(Op);
2842 LiveInPlaceholders.push_back(new VPValue());
2843 }
2844 }
2845
2846 // Replace each external operand with the first one created for it in
2847 // LiveInPlaceholders.
2848 for (auto *R : ExpressionRecipes)
2849 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
2850 R->replaceUsesOfWith(LiveIn, Tmp);
2851}
2852
2854 for (auto *R : ExpressionRecipes)
2855 // Since the list could contain duplicates, make sure the recipe hasn't
2856 // already been inserted.
2857 if (!R->getParent())
2858 R->insertBefore(this);
2859
2860 for (const auto &[Idx, Op] : enumerate(operands()))
2861 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
2862
2863 replaceAllUsesWith(ExpressionRecipes.back());
2864 ExpressionRecipes.clear();
2865}
2866
2868 VPCostContext &Ctx) const {
2869 Type *RedTy = Ctx.Types.inferScalarType(this);
2870 auto *SrcVecTy = cast<VectorType>(
2871 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
2872 assert(RedTy->isIntegerTy() &&
2873 "VPExpressionRecipe only supports integer types currently.");
2874 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2875 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
2876 switch (ExpressionType) {
2877 case ExpressionTypes::ExtendedReduction: {
2878 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2879 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
2880 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2881 return isa<VPPartialReductionRecipe>(ExpressionRecipes.back())
2882 ? Ctx.TTI.getPartialReductionCost(
2883 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr,
2884 RedTy, VF,
2886 ExtR->getOpcode()),
2887 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind)
2888 : Ctx.TTI.getExtendedReductionCost(
2889 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy,
2890 SrcVecTy, std::nullopt, Ctx.CostKind);
2891 }
2892 case ExpressionTypes::MulAccReduction:
2893 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
2894 Ctx.CostKind);
2895
2896 case ExpressionTypes::ExtNegatedMulAccReduction:
2897 assert(Opcode == Instruction::Add && "Unexpected opcode");
2898 Opcode = Instruction::Sub;
2899 [[fallthrough]];
2900 case ExpressionTypes::ExtMulAccReduction: {
2901 if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) {
2902 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2903 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2904 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2905 return Ctx.TTI.getPartialReductionCost(
2906 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
2907 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
2909 Ext0R->getOpcode()),
2911 Ext1R->getOpcode()),
2912 Mul->getOpcode(), Ctx.CostKind);
2913 }
2914 return Ctx.TTI.getMulAccReductionCost(
2915 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
2916 Instruction::ZExt,
2917 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
2918 }
2919 }
2920 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2921}
2922
2924 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
2925 return R->mayReadFromMemory() || R->mayWriteToMemory();
2926 });
2927}
2928
2930 assert(
2931 none_of(ExpressionRecipes,
2932 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2933 "expression cannot contain recipes with side-effects");
2934 return false;
2935}
2936
2938 // Cannot use vputils::isSingleScalar(), because all external operands
2939 // of the expression will be live-ins while bundled.
2940 return isa<VPReductionRecipe>(ExpressionRecipes.back()) &&
2941 !isa<VPPartialReductionRecipe>(ExpressionRecipes.back());
2942}
2943
2944#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2945
2947 VPSlotTracker &SlotTracker) const {
2948 O << Indent << "EXPRESSION ";
2950 O << " = ";
2951 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2952 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2953 bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
2954
2955 switch (ExpressionType) {
2956 case ExpressionTypes::ExtendedReduction: {
2958 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
2959 O << Instruction::getOpcodeName(Opcode) << " (";
2961 Red->printFlags(O);
2962
2963 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2964 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2965 << *Ext0->getResultType();
2966 if (Red->isConditional()) {
2967 O << ", ";
2968 Red->getCondOp()->printAsOperand(O, SlotTracker);
2969 }
2970 O << ")";
2971 break;
2972 }
2973 case ExpressionTypes::ExtNegatedMulAccReduction: {
2975 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
2977 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2978 << " (sub (0, mul";
2979 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
2980 Mul->printFlags(O);
2981 O << "(";
2983 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2984 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2985 << *Ext0->getResultType() << "), (";
2987 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2988 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2989 << *Ext1->getResultType() << ")";
2990 if (Red->isConditional()) {
2991 O << ", ";
2992 Red->getCondOp()->printAsOperand(O, SlotTracker);
2993 }
2994 O << "))";
2995 break;
2996 }
2997 case ExpressionTypes::MulAccReduction:
2998 case ExpressionTypes::ExtMulAccReduction: {
3000 O << " + " << (IsPartialReduction ? "partial." : "") << "reduce.";
3002 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3003 << " (";
3004 O << "mul";
3005 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3006 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3007 : ExpressionRecipes[0]);
3008 Mul->printFlags(O);
3009 if (IsExtended)
3010 O << "(";
3012 if (IsExtended) {
3013 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3014 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3015 << *Ext0->getResultType() << "), (";
3016 } else {
3017 O << ", ";
3018 }
3020 if (IsExtended) {
3021 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3022 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3023 << *Ext1->getResultType() << ")";
3024 }
3025 if (Red->isConditional()) {
3026 O << ", ";
3027 Red->getCondOp()->printAsOperand(O, SlotTracker);
3028 }
3029 O << ")";
3030 break;
3031 }
3032 }
3033}
3034
3036 VPSlotTracker &SlotTracker) const {
3037 O << Indent << "REDUCE ";
3039 O << " = ";
3041 O << " +";
3042 printFlags(O);
3043 O << " reduce."
3046 << " (";
3048 if (isConditional()) {
3049 O << ", ";
3051 }
3052 O << ")";
3053}
3054
3056 VPSlotTracker &SlotTracker) const {
3057 O << Indent << "REDUCE ";
3059 O << " = ";
3061 O << " +";
3062 printFlags(O);
3063 O << " vp.reduce."
3066 << " (";
3068 O << ", ";
3070 if (isConditional()) {
3071 O << ", ";
3073 }
3074 O << ")";
3075}
3076
3077#endif
3078
3079/// A helper function to scalarize a single Instruction in the innermost loop.
3080/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3081/// operands from \p RepRecipe instead of \p Instr's operands.
3082static void scalarizeInstruction(const Instruction *Instr,
3083 VPReplicateRecipe *RepRecipe,
3084 const VPLane &Lane, VPTransformState &State) {
3085 assert((!Instr->getType()->isAggregateType() ||
3086 canVectorizeTy(Instr->getType())) &&
3087 "Expected vectorizable or non-aggregate type.");
3088
3089 // Does this instruction return a value?
3090 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3091
3092 Instruction *Cloned = Instr->clone();
3093 if (!IsVoidRetTy) {
3094 Cloned->setName(Instr->getName() + ".cloned");
3095 Type *ResultTy = State.TypeAnalysis.inferScalarType(RepRecipe);
3096 // The operands of the replicate recipe may have been narrowed, resulting in
3097 // a narrower result type. Update the type of the cloned instruction to the
3098 // correct type.
3099 if (ResultTy != Cloned->getType())
3100 Cloned->mutateType(ResultTy);
3101 }
3102
3103 RepRecipe->applyFlags(*Cloned);
3104 RepRecipe->applyMetadata(*Cloned);
3105
3106 if (RepRecipe->hasPredicate())
3107 cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
3108
3109 if (auto DL = RepRecipe->getDebugLoc())
3110 State.setDebugLocFrom(DL);
3111
3112 // Replace the operands of the cloned instructions with their scalar
3113 // equivalents in the new loop.
3114 for (const auto &I : enumerate(RepRecipe->operands())) {
3115 auto InputLane = Lane;
3116 VPValue *Operand = I.value();
3117 if (vputils::isSingleScalar(Operand))
3118 InputLane = VPLane::getFirstLane();
3119 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
3120 }
3121
3122 // Place the cloned scalar in the new loop.
3123 State.Builder.Insert(Cloned);
3124
3125 State.set(RepRecipe, Cloned, Lane);
3126
3127 // If we just cloned a new assumption, add it to the assumption cache.
3128 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3129 State.AC->registerAssumption(II);
3130
3131 assert(
3132 (RepRecipe->getRegion() ||
3133 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3134 all_of(RepRecipe->operands(),
3135 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3136 "Expected a recipe is either within a region or all of its operands "
3137 "are defined outside the vectorized region.");
3138}
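// Sketch of the overall effect (lanes shown for a fixed VF = 4; operand names
// assumed): replicating 'udiv i32 %a, %b' produces one scalar clone per lane,
// each wired to that lane's scalar operands,
//   %r.0 = udiv i32 %a.0, %b.0
//   ...
//   %r.3 = udiv i32 %a.3, %b.3
// This helper emits a single such clone; the per-lane iteration is driven by
// the recipe's callers.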
3139
3142
3143 if (!State.Lane) {
3144 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3145 "must have already been unrolled");
3146 scalarizeInstruction(UI, this, VPLane(0), State);
3147 return;
3148 }
3149
3150 assert((State.VF.isScalar() || !isSingleScalar()) &&
3151 "uniform recipe shouldn't be predicated");
3152 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3153 scalarizeInstruction(UI, this, *State.Lane, State);
3154 // Insert scalar instance packing it into a vector.
3155 if (State.VF.isVector() && shouldPack()) {
3156 Value *WideValue =
3157 State.Lane->isFirstLane()
3158 ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
3159 : State.get(this);
3160 State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
3161 *State.Lane));
3162 }
3163}
3164
3166 // Find if the recipe is used by a widened recipe via an intervening
3167 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3168 return any_of(users(), [](const VPUser *U) {
3169 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3170 return !vputils::onlyScalarValuesUsed(PredR);
3171 return false;
3172 });
3173}
3174
3175 /// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3176 /// which the legacy cost model computes a SCEV expression when computing the
3177 /// address cost. Computing SCEVs for VPValues is incomplete and may return
3178 /// SCEVCouldNotCompute where the legacy cost model can compute a SCEV; in
3179 /// those cases we fall back to the legacy cost model. Returns nullptr otherwise.
3181 const Loop *L) {
3182 auto *PtrR = Ptr->getDefiningRecipe();
3183 if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
3185 Instruction::GetElementPtr) ||
3186 isa<VPWidenGEPRecipe>(PtrR) ||
3188 return nullptr;
3189
3190 // We are looking for a GEP where all indices are either loop invariant or
3191 // inductions.
3192 for (VPValue *Opd : drop_begin(PtrR->operands())) {
3193 if (!Opd->isDefinedOutsideLoopRegions() &&
3195 return nullptr;
3196 }
3197
3198 return vputils::getSCEVExprForVPValue(Ptr, SE, L);
3199}
3200
3201/// Returns true if \p V is used as part of the address of another load or
3202/// store.
3203static bool isUsedByLoadStoreAddress(const VPUser *V) {
3205 SmallVector<const VPUser *> WorkList = {V};
3206
3207 while (!WorkList.empty()) {
3208 auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
3209 if (!Cur || !Seen.insert(Cur).second)
3210 continue;
3211
3212 auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
3213 // Skip blends that use V only through their mask: if none of the incoming
3214 // values has been visited yet, V can only reach the blend via the mask.
3215 if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
3216 [&](unsigned I) {
3217 return Seen.contains(
3218 Blend->getIncomingValue(I)->getDefiningRecipe());
3219 }))
3220 continue;
3221
3222 for (VPUser *U : Cur->users()) {
3223 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
3224 if (InterleaveR->getAddr() == Cur)
3225 return true;
3226 if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
3227 if (RepR->getOpcode() == Instruction::Load &&
3228 RepR->getOperand(0) == Cur)
3229 return true;
3230 if (RepR->getOpcode() == Instruction::Store &&
3231 RepR->getOperand(1) == Cur)
3232 return true;
3233 }
3234 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
3235 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3236 return true;
3237 }
3238 }
3239
3240 // The legacy cost model only supports scalarized loads/stores with phi
3241 // addresses if the phi is directly used as the load/store address. Don't
3242 // traverse further for blends.
3243 if (Blend)
3244 continue;
3245
3246 append_range(WorkList, Cur->users());
3247 }
3248 return false;
3249}
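// For example (hypothetical pointer-chasing pattern): in
//   %p = load ptr, ptr %slot
//   %v = load i32, ptr %p
// the first load's result feeds the address of the second, so this helper
// returns true for it and the cost code below skips the usual scalarization
// overhead for its result.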
3250
3252 VPCostContext &Ctx) const {
3254 // A VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3255 // transform; avoid computing its cost multiple times for now.
3256 Ctx.SkipCostComputation.insert(UI);
3257
3258 if (VF.isScalable() && !isSingleScalar())
3260
3261 switch (UI->getOpcode()) {
3262 case Instruction::GetElementPtr:
3263 // We mark this instruction as zero-cost because the cost of GEPs in
3264 // vectorized code depends on whether the corresponding memory instruction
3265 // is scalarized or not. Therefore, we handle GEPs with the memory
3266 // instruction cost.
3267 return 0;
3268 case Instruction::Call: {
3269 auto *CalledFn =
3271
3274 for (const VPValue *ArgOp : ArgOps)
3275 Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
3276
3277 if (CalledFn->isIntrinsic())
3278 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3279 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3280 switch (CalledFn->getIntrinsicID()) {
3281 case Intrinsic::assume:
3282 case Intrinsic::lifetime_end:
3283 case Intrinsic::lifetime_start:
3284 case Intrinsic::sideeffect:
3285 case Intrinsic::pseudoprobe:
3286 case Intrinsic::experimental_noalias_scope_decl: {
3287 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3288 ElementCount::getFixed(1), Ctx) == 0 &&
3289 "scalarizing intrinsic should be free");
3290 return InstructionCost(0);
3291 }
3292 default:
3293 break;
3294 }
3295
3296 Type *ResultTy = Ctx.Types.inferScalarType(this);
3297 InstructionCost ScalarCallCost =
3298 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3299 if (isSingleScalar()) {
3300 if (CalledFn->isIntrinsic())
3301 ScalarCallCost = std::min(
3302 ScalarCallCost,
3303 getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3304 ElementCount::getFixed(1), Ctx));
3305 return ScalarCallCost;
3306 }
3307
3308 return ScalarCallCost * VF.getFixedValue() +
3309 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3310 }
3311 case Instruction::Add:
3312 case Instruction::Sub:
3313 case Instruction::FAdd:
3314 case Instruction::FSub:
3315 case Instruction::Mul:
3316 case Instruction::FMul:
3317 case Instruction::FDiv:
3318 case Instruction::FRem:
3319 case Instruction::Shl:
3320 case Instruction::LShr:
3321 case Instruction::AShr:
3322 case Instruction::And:
3323 case Instruction::Or:
3324 case Instruction::Xor:
3325 case Instruction::ICmp:
3326 case Instruction::FCmp:
3328 Ctx) *
3329 (isSingleScalar() ? 1 : VF.getFixedValue());
3330 case Instruction::SDiv:
3331 case Instruction::UDiv:
3332 case Instruction::SRem:
3333 case Instruction::URem: {
3334 InstructionCost ScalarCost =
3336 if (isSingleScalar())
3337 return ScalarCost;
3338
3339 ScalarCost = ScalarCost * VF.getFixedValue() +
3340 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3341 to_vector(operands()), VF);
3342 // If the recipe is not predicated (i.e. not in a replicate region), return
3343 // the scalar cost. Otherwise handle predicated cost.
3344 if (!getRegion()->isReplicator())
3345 return ScalarCost;
3346
3347 // Account for the phi nodes that we will create.
3348 ScalarCost += VF.getFixedValue() *
3349 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3350 // Scale the cost by the probability of executing the predicated blocks.
3351 // This assumes the predicated block for each vector lane is equally
3352 // likely.
3353 ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
3354 return ScalarCost;
3355 }
3356 case Instruction::Load:
3357 case Instruction::Store: {
3358 // TODO: See getMemInstScalarizationCost for how to handle replicating and
3359 // predicated cases.
3360 const VPRegionBlock *ParentRegion = getRegion();
3361 if (ParentRegion && ParentRegion->isReplicator())
3362 break;
3363
3364 bool IsLoad = UI->getOpcode() == Instruction::Load;
3365 const VPValue *PtrOp = getOperand(!IsLoad);
3366 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
3368 break;
3369
3370 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3371 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3372 const Align Alignment = getLoadStoreAlignment(UI);
3373 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3375 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3376 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
3377
3378 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3379 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3380 bool UsedByLoadStoreAddress =
3381 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
3382 InstructionCost ScalarCost =
3383 ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
3384 PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE,
3385 PtrSCEV, Ctx.CostKind);
3386 if (isSingleScalar())
3387 return ScalarCost;
3388
3389 SmallVector<const VPValue *> OpsToScalarize;
3390 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3391 // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
3392 // don't assign scalarization overhead if the target prefers vectorized
3393 // addressing or if the loaded value is used as part of the address of another
3394 // load or store.
3395 if (!UsedByLoadStoreAddress) {
3396 bool EfficientVectorLoadStore =
3397 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3398 if (!(IsLoad && !PreferVectorizedAddressing) &&
3399 !(!IsLoad && EfficientVectorLoadStore))
3400 append_range(OpsToScalarize, operands());
3401
3402 if (!EfficientVectorLoadStore)
3403 ResultTy = Ctx.Types.inferScalarType(this);
3404 }
3405
3406 return (ScalarCost * VF.getFixedValue()) +
3407 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
3408 }
3409 }
3410
3411 return Ctx.getLegacyCost(UI, VF);
3412}
3413
3414#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3416 VPSlotTracker &SlotTracker) const {
3417 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3418
3419 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3421 O << " = ";
3422 }
3423 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3424 O << "call";
3425 printFlags(O);
3426 O << "@" << CB->getCalledFunction()->getName() << "(";
3428 O, [&O, &SlotTracker](VPValue *Op) {
3429 Op->printAsOperand(O, SlotTracker);
3430 });
3431 O << ")";
3432 } else {
3434 printFlags(O);
3436 }
3437
3438 if (shouldPack())
3439 O << " (S->V)";
3440}
3441#endif
3442
3444 assert(State.Lane && "Branch on Mask works only on single instance.");
3445
3446 VPValue *BlockInMask = getOperand(0);
3447 Value *ConditionBit = State.get(BlockInMask, *State.Lane);
3448
3449 // Replace the temporary unreachable terminator with a new conditional branch,
3450 // whose two destinations will be set later when they are created.
3451 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3452 assert(isa<UnreachableInst>(CurrentTerminator) &&
3453 "Expected to replace unreachable terminator with conditional branch.");
3454 auto CondBr =
3455 State.Builder.CreateCondBr(ConditionBit, State.CFG.PrevBB, nullptr);
3456 CondBr->setSuccessor(0, nullptr);
3457 CurrentTerminator->eraseFromParent();
3458}
3459
3461 VPCostContext &Ctx) const {
3462 // The legacy cost model doesn't assign costs to branches for individual
3463 // replicate regions. Match the current behavior in the VPlan cost model for
3464 // now.
3465 return 0;
3466}
3467
3469 assert(State.Lane && "Predicated instruction PHI works per instance.");
3470 Instruction *ScalarPredInst =
3471 cast<Instruction>(State.get(getOperand(0), *State.Lane));
3472 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3473 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3474 assert(PredicatingBB && "Predicated block has no single predecessor.");
3476 "operand must be VPReplicateRecipe");
3477
3478 // By current pack/unpack logic we need to generate only a single phi node: if
3479 // a vector value for the predicated instruction exists at this point it means
3480 // the instruction has vector users only, and a phi for the vector value is
3481 // needed. In this case the recipe of the predicated instruction is marked to
3482 // also do that packing, thereby "hoisting" the insert-element sequence.
3483 // Otherwise, a phi node for the scalar value is needed.
3484 if (State.hasVectorValue(getOperand(0))) {
3485 auto *VecI = cast<Instruction>(State.get(getOperand(0)));
3487 "Packed operands must generate an insertelement or insertvalue");
3488
3489 // If VectorI is a struct, it will be a sequence like:
3490 // %1 = insertvalue %unmodified, %x, 0
3491 // %2 = insertvalue %1, %y, 1
3492 // %VectorI = insertvalue %2, %z, 2
3493 // To get the unmodified vector we need to look through the chain.
3494 if (auto *StructTy = dyn_cast<StructType>(VecI->getType()))
3495 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3496 VecI = cast<InsertValueInst>(VecI->getOperand(0));
3497
3498 PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
3499 VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
3500 VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
3501 if (State.hasVectorValue(this))
3502 State.reset(this, VPhi);
3503 else
3504 State.set(this, VPhi);
3505 // NOTE: Currently we need to update the value of the operand, so the next
3506 // predicated iteration inserts its generated value in the correct vector.
3507 State.reset(getOperand(0), VPhi);
3508 } else {
3509 if (vputils::onlyFirstLaneUsed(this) && !State.Lane->isFirstLane())
3510 return;
3511
3512 Type *PredInstType = State.TypeAnalysis.inferScalarType(getOperand(0));
3513 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
3514 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
3515 PredicatingBB);
3516 Phi->addIncoming(ScalarPredInst, PredicatedBB);
3517 if (State.hasScalarValue(this, *State.Lane))
3518 State.reset(this, Phi, *State.Lane);
3519 else
3520 State.set(this, Phi, *State.Lane);
3521 // NOTE: Currently we need to update the value of the operand, so the next
3522 // predicated iteration inserts its generated value in the correct vector.
3523 State.reset(getOperand(0), Phi, *State.Lane);
3524 }
3525}
3526
3527#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3529 VPSlotTracker &SlotTracker) const {
3530 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3532 O << " = ";
3534}
3535#endif
3536
3538 VPCostContext &Ctx) const {
3540 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3541 ->getAddressSpace();
3542 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
3543 ? Instruction::Load
3544 : Instruction::Store;
3545
3546 if (!Consecutive) {
3547 // TODO: Using the original IR may not be accurate.
3548 // Currently, ARM will use the underlying IR to calculate gather/scatter
3549 // instruction cost.
3550 assert(!Reverse &&
3551 "Inconsecutive memory access should not have the order.");
3552
3554 Type *PtrTy = Ptr->getType();
3555
3556 // If the address value is uniform across all lanes, then the address can be
3557 // computed with a scalar type and then broadcast.
3559 PtrTy = toVectorTy(PtrTy, VF);
3560
3561 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3562 Ctx.CostKind) +
3563 Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
3564 Ctx.CostKind, &Ingredient);
3565 }
3566
3568 if (IsMasked) {
3569 Cost +=
3570 Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
3571 } else {
3572 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3574 : getOperand(1));
3575 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3576 OpInfo, &Ingredient);
3577 }
3578 if (!Reverse)
3579 return Cost;
3580
3581 return Cost += Ctx.TTI.getShuffleCost(
3583 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3584}
3585
3587 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3588 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3589 bool CreateGather = !isConsecutive();
3590
3591 auto &Builder = State.Builder;
3592 Value *Mask = nullptr;
3593 if (auto *VPMask = getMask()) {
3594 // Mask reversal is only needed when an explicit mask is present; a missing
3595 // (null) mask represents an all-ones mask, and its reverse is still all-ones.
3596 Mask = State.get(VPMask);
3597 if (isReverse())
3598 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3599 }
3600
3601 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3602 Value *NewLI;
3603 if (CreateGather) {
3604 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3605 "wide.masked.gather");
3606 } else if (Mask) {
3607 NewLI =
3608 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3609 PoisonValue::get(DataTy), "wide.masked.load");
3610 } else {
3611 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3612 }
3614 if (Reverse)
3615 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
3616 State.set(this, NewLI);
3617}
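// Depending on consecutiveness and masking, the code above emits one of the
// following (sketch with VF = 4, i32 elements, alignment assumed):
//   %l = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %addrs,
//            i32 4, <4 x i1> %mask, <4 x i32> poison)    ; gather
//   %l = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4,
//            <4 x i1> %mask, <4 x i32> poison)           ; consecutive, masked
//   %l = load <4 x i32>, ptr %addr, align 4              ; consecutive
// followed by a vector reverse when the access is reversed.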
3618
3619#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3621 VPSlotTracker &SlotTracker) const {
3622 O << Indent << "WIDEN ";
3624 O << " = load ";
3626}
3627#endif
3628
3629 /// Use an all-true mask for the reverse rather than the actual mask, as this
3630 /// avoids a dependence without affecting the result.
3632 Value *EVL, const Twine &Name) {
3633 VectorType *ValTy = cast<VectorType>(Operand->getType());
3634 Value *AllTrueMask =
3635 Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
3636 return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
3637 {Operand, AllTrueMask, EVL}, nullptr, Name);
3638}
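// [Editorial example, not part of the original source] For a
// <vscale x 4 x i32> operand, the helper above emits roughly the following,
// where %alltrue is a splat of i1 true:
//   %vp.reverse = call <vscale x 4 x i32>
//       @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %val,
//                                             <vscale x 4 x i1> %alltrue,
//                                             i32 %evl)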
3639
3641 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
3642 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3643 bool CreateGather = !isConsecutive();
3644
3645 auto &Builder = State.Builder;
3646 CallInst *NewLI;
3647 Value *EVL = State.get(getEVL(), VPLane(0));
3648 Value *Addr = State.get(getAddr(), !CreateGather);
3649 Value *Mask = nullptr;
3650 if (VPValue *VPMask = getMask()) {
3651 Mask = State.get(VPMask);
3652 if (isReverse())
3653 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3654 } else {
3655 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3656 }
3657
3658 if (CreateGather) {
3659 NewLI =
3660 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3661 nullptr, "wide.masked.gather");
3662 } else {
3663 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3664 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3665 }
3666 NewLI->addParamAttr(
3668 applyMetadata(*NewLI);
3669 Instruction *Res = NewLI;
3670 if (isReverse())
3671 Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
3672 State.set(this, Res);
3673}
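// [Editorial example, not part of the original source] A consecutive,
// unmasked EVL load of <vscale x 4 x i32> produced by the code above looks
// roughly like:
//   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
//       ptr %addr, <vscale x 4 x i1> %alltrue, i32 %evl)
// with the pointer argument additionally annotated with an align parameter
// attribute via addParamAttr.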
3674
3676 VPCostContext &Ctx) const {
3677 if (!Consecutive || IsMasked)
3678 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3679
3680 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
3681 // here because the EVL recipes use EVL to replace the tail mask, while the
3682 // legacy model always accounts for the cost of the mask.
3683 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
3684 // no longer need to compare against the legacy cost model.
3686 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3687 ->getAddressSpace();
3688 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3689 Instruction::Load, Ty, Alignment, AS, Ctx.CostKind);
3690 if (!Reverse)
3691 return Cost;
3692
3693 return Cost + Ctx.TTI.getShuffleCost(
3695 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3696}
3697
3698#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3700 VPSlotTracker &SlotTracker) const {
3701 O << Indent << "WIDEN ";
3703 O << " = vp.load ";
3705}
3706#endif
3707
3709 VPValue *StoredVPValue = getStoredValue();
3710 bool CreateScatter = !isConsecutive();
3711
3712 auto &Builder = State.Builder;
3713
3714 Value *Mask = nullptr;
3715 if (auto *VPMask = getMask()) {
3716 // Mask reversal is only needed for real (non-null) masks; a null mask
3717 // denotes all-ones, and the reverse of an all-ones mask is still all-ones.
3718 Mask = State.get(VPMask);
3719 if (isReverse())
3720 Mask = Builder.CreateVectorReverse(Mask, "reverse");
3721 }
3722
3723 Value *StoredVal = State.get(StoredVPValue);
3724 if (isReverse()) {
3725 // If we store to reverse consecutive memory locations, then we need
3726 // to reverse the order of elements in the stored value.
3727 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
3728 // We don't want to update the value in the map as it might be used in
3729 // another expression. So don't call resetVectorValue(StoredVal).
3730 }
3731 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
3732 Instruction *NewSI = nullptr;
3733 if (CreateScatter)
3734 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
3735 else if (Mask)
3736 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
3737 else
3738 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
3739 applyMetadata(*NewSI);
3740}
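// [Editorial example, not part of the original source] A non-consecutive
// (scatter) store with a fixed VF of 4 emitted by the code above looks
// roughly like the following (the alignment of 4 is illustrative):
//   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs,
//                                             i32 4, <4 x i1> %mask)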
3741
3742#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3744 VPSlotTracker &SlotTracker) const {
3745 O << Indent << "WIDEN store ";
3747}
3748#endif
3749
3751 VPValue *StoredValue = getStoredValue();
3752 bool CreateScatter = !isConsecutive();
3753
3754 auto &Builder = State.Builder;
3755
3756 CallInst *NewSI = nullptr;
3757 Value *StoredVal = State.get(StoredValue);
3758 Value *EVL = State.get(getEVL(), VPLane(0));
3759 if (isReverse())
3760 StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
3761 Value *Mask = nullptr;
3762 if (VPValue *VPMask = getMask()) {
3763 Mask = State.get(VPMask);
3764 if (isReverse())
3765 Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
3766 } else {
3767 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3768 }
3769 Value *Addr = State.get(getAddr(), !CreateScatter);
3770 if (CreateScatter) {
3771 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3772 Intrinsic::vp_scatter,
3773 {StoredVal, Addr, Mask, EVL});
3774 } else {
3775 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
3776 Intrinsic::vp_store,
3777 {StoredVal, Addr, Mask, EVL});
3778 }
3779 NewSI->addParamAttr(
3781 applyMetadata(*NewSI);
3782}
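// [Editorial example, not part of the original source] A consecutive EVL
// store of <vscale x 4 x i32> emitted by the code above looks roughly like:
//   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr %addr,
//                                       <vscale x 4 x i1> %mask, i32 %evl)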
3783
3785 VPCostContext &Ctx) const {
3786 if (!Consecutive || IsMasked)
3787 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3788
3789 // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost()
3790 // here because the EVL recipes use EVL to replace the tail mask, while the
3791 // legacy model always accounts for the cost of the mask.
3792 // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we
3793 // no longer need to compare against the legacy cost model.
3795 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3796 ->getAddressSpace();
3797 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3798 Instruction::Store, Ty, Alignment, AS, Ctx.CostKind);
3799 if (!Reverse)
3800 return Cost;
3801
3802 return Cost + Ctx.TTI.getShuffleCost(
3804 cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
3805}
3806
3807#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3809 VPSlotTracker &SlotTracker) const {
3810 O << Indent << "WIDEN vp.store ";
3812}
3813#endif
3814
3816 VectorType *DstVTy, const DataLayout &DL) {
3817 // Verify that V is a vector type with the same number of elements as DstVTy.
3818 auto VF = DstVTy->getElementCount();
3819 auto *SrcVecTy = cast<VectorType>(V->getType());
3820 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3821 Type *SrcElemTy = SrcVecTy->getElementType();
3822 Type *DstElemTy = DstVTy->getElementType();
3823 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3824 "Vector elements must have same size");
3825
3826 // Do a direct cast if element types are castable.
3827 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3828 return Builder.CreateBitOrPointerCast(V, DstVTy);
3829 }
3830 // V cannot be directly cast to the desired vector type.
3831 // This may happen when V is a floating point vector but DstVTy is a vector
3832 // of pointers, or vice-versa. Handle this using a two-step bitcast with an
3833 // intermediate integer type for the bitcast, i.e. Ptr <-> Int <-> Float.
3834 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3835 "Only one type should be a pointer type");
3836 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3837 "Only one type should be a floating point type");
3838 Type *IntTy =
3839 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3840 auto *VecIntTy = VectorType::get(IntTy, VF);
3841 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3842 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3843}
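// [Editorial example, not part of the original source] Casting <4 x double>
// to <4 x ptr> on a target with 64-bit pointers takes the two-step path
// above:
//   %cast.int = bitcast <4 x double> %v to <4 x i64>
//   %cast.ptr = inttoptr <4 x i64> %cast.int to <4 x ptr>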
3844
3845/// Return a vector containing interleaved elements from multiple
3846/// smaller input vectors.
3848 const Twine &Name) {
3849 unsigned Factor = Vals.size();
3850 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3851
3852 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
3853#ifndef NDEBUG
3854 for (Value *Val : Vals)
3855 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3856#endif
3857
3858 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3859 // must use intrinsics to interleave.
3860 if (VecTy->isScalableTy()) {
3861 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3862 return Builder.CreateVectorInterleave(Vals, Name);
3863 }
3864
3865 // Fixed length. Start by concatenating all vectors into a wide vector.
3866 Value *WideVec = concatenateVectors(Builder, Vals);
3867
3868 // Interleave the elements into the wide vector.
3869 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3870 return Builder.CreateShuffleVector(
3871 WideVec, createInterleaveMask(NumElts, Factor), Name);
3872}
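// [Editorial example, not part of the original source] Interleaving two
// fixed-width <4 x i32> vectors %a and %b first concatenates them and then
// applies createInterleaveMask(4, 2):
//   %concat = shufflevector <4 x i32> %a, <4 x i32> %b,
//       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %interleaved = shufflevector <8 x i32> %concat, <8 x i32> poison,
//       <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>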
3873
3874// Try to vectorize the interleave group that \p Instr belongs to.
3875//
3876// E.g. Translate the following interleaved load group (factor = 3):
3877// for (i = 0; i < N; i+=3) {
3878// R = Pic[i]; // Member of index 0
3879// G = Pic[i+1]; // Member of index 1
3880// B = Pic[i+2]; // Member of index 2
3881// ... // do something to R, G, B
3882// }
3883// To:
3884// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3885// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3886// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3887// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3888//
3889// Or translate the following interleaved store group (factor = 3):
3890// for (i = 0; i < N; i+=3) {
3891// ... do something to R, G, B
3892// Pic[i] = R; // Member of index 0
3893// Pic[i+1] = G; // Member of index 1
3894// Pic[i+2] = B; // Member of index 2
3895// }
3896// To:
3897// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3898// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3899// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3900// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3901// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3903 assert(!State.Lane && "Interleave group being replicated.");
3904 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
3905 "Masking gaps for scalable vectors is not yet supported.");
3907 Instruction *Instr = Group->getInsertPos();
3908
3909 // Prepare for the vector type of the interleaved load/store.
3910 Type *ScalarTy = getLoadStoreType(Instr);
3911 unsigned InterleaveFactor = Group->getFactor();
3912 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
3913
3914 VPValue *BlockInMask = getMask();
3915 VPValue *Addr = getAddr();
3916 Value *ResAddr = State.get(Addr, VPLane(0));
3917
3918 auto CreateGroupMask = [&BlockInMask, &State,
3919 &InterleaveFactor](Value *MaskForGaps) -> Value * {
3920 if (State.VF.isScalable()) {
3921 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
3922 assert(InterleaveFactor <= 8 &&
3923 "Unsupported deinterleave factor for scalable vectors");
3924 auto *ResBlockInMask = State.get(BlockInMask);
3925 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3926 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
3927 }
3928
3929 if (!BlockInMask)
3930 return MaskForGaps;
3931
3932 Value *ResBlockInMask = State.get(BlockInMask);
3933 Value *ShuffledMask = State.Builder.CreateShuffleVector(
3934 ResBlockInMask,
3935 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
3936 "interleaved.mask");
3937 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
3938 ShuffledMask, MaskForGaps)
3939 : ShuffledMask;
3940 };
3941
3942 const DataLayout &DL = Instr->getDataLayout();
3943 // Vectorize the interleaved load group.
3944 if (isa<LoadInst>(Instr)) {
3945 Value *MaskForGaps = nullptr;
3946 if (needsMaskForGaps()) {
3947 MaskForGaps =
3948 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
3949 assert(MaskForGaps && "Mask for Gaps is required but it is null");
3950 }
3951
3952 Instruction *NewLoad;
3953 if (BlockInMask || MaskForGaps) {
3954 Value *GroupMask = CreateGroupMask(MaskForGaps);
3955 Value *PoisonVec = PoisonValue::get(VecTy);
3956 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
3957 Group->getAlign(), GroupMask,
3958 PoisonVec, "wide.masked.vec");
3959 } else
3960 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
3961 Group->getAlign(), "wide.vec");
3962 applyMetadata(*NewLoad);
3963 // TODO: Also manage existing metadata using VPIRMetadata.
3964 Group->addMetadata(NewLoad);
3965
3967 if (VecTy->isScalableTy()) {
3968 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3969 // so must use intrinsics to deinterleave.
3970 assert(InterleaveFactor <= 8 &&
3971 "Unsupported deinterleave factor for scalable vectors");
3972 NewLoad = State.Builder.CreateIntrinsic(
3973 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
3974 NewLoad->getType(), NewLoad,
3975 /*FMFSource=*/nullptr, "strided.vec");
3976 }
3977
3978 auto CreateStridedVector = [&InterleaveFactor, &State,
3979 &NewLoad](unsigned Index) -> Value * {
3980 assert(Index < InterleaveFactor && "Illegal group index");
3981 if (State.VF.isScalable())
3982 return State.Builder.CreateExtractValue(NewLoad, Index);
3983
3984 // For fixed length VF, use shuffle to extract the sub-vectors from the
3985 // wide load.
3986 auto StrideMask =
3987 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
3988 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
3989 "strided.vec");
3990 };
3991
3992 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3993 Instruction *Member = Group->getMember(I);
3994
3995 // Skip the gaps in the group.
3996 if (!Member)
3997 continue;
3998
3999 Value *StridedVec = CreateStridedVector(I);
4000
4001 // If this member has a different type, cast the result to that type.
4002 if (Member->getType() != ScalarTy) {
4003 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4004 StridedVec =
4005 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4006 }
4007
4008 if (Group->isReverse())
4009 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4010
4011 State.set(VPDefs[J], StridedVec);
4012 ++J;
4013 }
4014 return;
4015 }
4016
4017 // The sub-vector type for the current instruction.
4018 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4019
4020 // Vectorize the interleaved store group.
4021 Value *MaskForGaps =
4022 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4023 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4024 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4025 ArrayRef<VPValue *> StoredValues = getStoredValues();
4026 // Collect the stored vector from each member.
4027 SmallVector<Value *, 4> StoredVecs;
4028 unsigned StoredIdx = 0;
4029 for (unsigned i = 0; i < InterleaveFactor; i++) {
4030 assert((Group->getMember(i) || MaskForGaps) &&
4031 "Fail to get a member from an interleaved store group");
4032 Instruction *Member = Group->getMember(i);
4033
4034 // Skip the gaps in the group.
4035 if (!Member) {
4036 Value *Undef = PoisonValue::get(SubVT);
4037 StoredVecs.push_back(Undef);
4038 continue;
4039 }
4040
4041 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4042 ++StoredIdx;
4043
4044 if (Group->isReverse())
4045 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4046
4047 // If this member has a different type, cast it to a unified type.
4048
4049 if (StoredVec->getType() != SubVT)
4050 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4051
4052 StoredVecs.push_back(StoredVec);
4053 }
4054
4055 // Interleave all the smaller vectors into one wider vector.
4056 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4057 Instruction *NewStoreInstr;
4058 if (BlockInMask || MaskForGaps) {
4059 Value *GroupMask = CreateGroupMask(MaskForGaps);
4060 NewStoreInstr = State.Builder.CreateMaskedStore(
4061 IVec, ResAddr, Group->getAlign(), GroupMask);
4062 } else
4063 NewStoreInstr =
4064 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4065
4066 applyMetadata(*NewStoreInstr);
4067 // TODO: Also manage existing metadata using VPIRMetadata.
4068 Group->addMetadata(NewStoreInstr);
4069}
4070
4071#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4073 VPSlotTracker &SlotTracker) const {
4075 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4076 IG->getInsertPos()->printAsOperand(O, false);
4077 O << ", ";
4079 VPValue *Mask = getMask();
4080 if (Mask) {
4081 O << ", ";
4082 Mask->printAsOperand(O, SlotTracker);
4083 }
4084
4085 unsigned OpIdx = 0;
4086 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4087 if (!IG->getMember(i))
4088 continue;
4089 if (getNumStoreOperands() > 0) {
4090 O << "\n" << Indent << " store ";
4092 O << " to index " << i;
4093 } else {
4094 O << "\n" << Indent << " ";
4096 O << " = load from index " << i;
4097 }
4098 ++OpIdx;
4099 }
4100}
4101#endif
4102
4104 assert(!State.Lane && "Interleave group being replicated.");
4105 assert(State.VF.isScalable() &&
4106 "Only support scalable VF for EVL tail-folding.");
4108 "Masking gaps for scalable vectors is not yet supported.");
4110 Instruction *Instr = Group->getInsertPos();
4111
4112 // Prepare for the vector type of the interleaved load/store.
4113 Type *ScalarTy = getLoadStoreType(Instr);
4114 unsigned InterleaveFactor = Group->getFactor();
4115 assert(InterleaveFactor <= 8 &&
4116 "Unsupported deinterleave/interleave factor for scalable vectors");
4117 ElementCount WideVF = State.VF * InterleaveFactor;
4118 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4119
4120 VPValue *Addr = getAddr();
4121 Value *ResAddr = State.get(Addr, VPLane(0));
4122 Value *EVL = State.get(getEVL(), VPLane(0));
4123 Value *InterleaveEVL = State.Builder.CreateMul(
4124 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4125 /* NUW= */ true, /* NSW= */ true);
4126 LLVMContext &Ctx = State.Builder.getContext();
4127
4128 Value *GroupMask = nullptr;
4129 if (VPValue *BlockInMask = getMask()) {
4130 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4131 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4132 } else {
4133 GroupMask =
4134 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4135 }
4136
4137 // Vectorize the interleaved load group.
4138 if (isa<LoadInst>(Instr)) {
4139 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4140 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4141 "wide.vp.load");
4142 NewLoad->addParamAttr(0,
4143 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4144
4145 applyMetadata(*NewLoad);
4146 // TODO: Also manage existing metadata using VPIRMetadata.
4147 Group->addMetadata(NewLoad);
4148
4149 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4150 // so must use intrinsics to deinterleave.
4151 NewLoad = State.Builder.CreateIntrinsic(
4152 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4153 NewLoad->getType(), NewLoad,
4154 /*FMFSource=*/nullptr, "strided.vec");
4155
4156 const DataLayout &DL = Instr->getDataLayout();
4157 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4158 Instruction *Member = Group->getMember(I);
4159 // Skip the gaps in the group.
4160 if (!Member)
4161 continue;
4162
4163 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4164 // If this member has a different type, cast the result to that type.
4165 if (Member->getType() != ScalarTy) {
4166 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4167 StridedVec =
4168 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4169 }
4170
4171 State.set(getVPValue(J), StridedVec);
4172 ++J;
4173 }
4174 return;
4175 } // End for interleaved load.
4176
4177 // The sub-vector type for the current instruction.
4178 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4179 // Vectorize the interleaved store group.
4180 ArrayRef<VPValue *> StoredValues = getStoredValues();
4181 // Collect the stored vector from each member.
4182 SmallVector<Value *, 4> StoredVecs;
4183 const DataLayout &DL = Instr->getDataLayout();
4184 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4185 Instruction *Member = Group->getMember(I);
4186 // Skip the gaps in the group.
4187 if (!Member) {
4188 StoredVecs.push_back(PoisonValue::get(SubVT));
4189 continue;
4190 }
4191
4192 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4193 // If this member has a different type, cast it to a unified type.
4194 if (StoredVec->getType() != SubVT)
4195 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4196
4197 StoredVecs.push_back(StoredVec);
4198 ++StoredIdx;
4199 }
4200
4201 // Interleave all the smaller vectors into one wider vector.
4202 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4203 CallInst *NewStore =
4204 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4205 {IVec, ResAddr, GroupMask, InterleaveEVL});
4206 NewStore->addParamAttr(1,
4207 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4208
4209 applyMetadata(*NewStore);
4210 // TODO: Also manage existing metadata using VPIRMetadata.
4211 Group->addMetadata(NewStore);
4212}
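// [Editorial example, not part of the original source] For an EVL-folded
// interleaved load group with factor 2 and <vscale x 4 x i32> members, the
// code above emits roughly:
//   %interleave.evl = mul nuw nsw i32 %evl, 2
//   %wide.vp.load = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(
//       ptr %addr, <vscale x 8 x i1> %interleaved.mask, i32 %interleave.evl)
//   %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//       @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vp.load)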
4213
4214#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4216 VPSlotTracker &SlotTracker) const {
4218 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4219 IG->getInsertPos()->printAsOperand(O, false);
4220 O << ", ";
4222 O << ", ";
4224 if (VPValue *Mask = getMask()) {
4225 O << ", ";
4226 Mask->printAsOperand(O, SlotTracker);
4227 }
4228
4229 unsigned OpIdx = 0;
4230 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4231 if (!IG->getMember(i))
4232 continue;
4233 if (getNumStoreOperands() > 0) {
4234 O << "\n" << Indent << " vp.store ";
4236 O << " to index " << i;
4237 } else {
4238 O << "\n" << Indent << " ";
4240 O << " = vp.load from index " << i;
4241 }
4242 ++OpIdx;
4243 }
4244}
4245#endif
4246
4248 VPCostContext &Ctx) const {
4249 Instruction *InsertPos = getInsertPos();
4250 // Find the VPValue index of the interleave group. We need to skip gaps.
4251 unsigned InsertPosIdx = 0;
4252 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4253 if (auto *Member = IG->getMember(Idx)) {
4254 if (Member == InsertPos)
4255 break;
4256 InsertPosIdx++;
4257 }
4258 Type *ValTy = Ctx.Types.inferScalarType(
4259 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4260 : getStoredValues()[InsertPosIdx]);
4261 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4262 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4263 ->getAddressSpace();
4264
4265 unsigned InterleaveFactor = IG->getFactor();
4266 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4267
4268 // Holds the indices of existing members in the interleaved group.
4270 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4271 if (IG->getMember(IF))
4272 Indices.push_back(IF);
4273
4274 // Calculate the cost of the whole interleaved group.
4275 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4276 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4277 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4278
4279 if (!IG->isReverse())
4280 return Cost;
4281
4282 return Cost + IG->getNumMembers() *
4283 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4284 VectorTy, VectorTy, {}, Ctx.CostKind,
4285 0);
4286}
4287
4288#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4290 VPSlotTracker &SlotTracker) const {
4291 O << Indent << "EMIT ";
4293 O << " = CANONICAL-INDUCTION ";
4295}
4296#endif
4297
4299 return IsScalarAfterVectorization &&
4300 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4301}
4302
4303#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4305 VPSlotTracker &SlotTracker) const {
4306 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4307 "unexpected number of operands");
4308 O << Indent << "EMIT ";
4310 O << " = WIDEN-POINTER-INDUCTION ";
4312 O << ", ";
4314 O << ", ";
4316 if (getNumOperands() == 5) {
4317 O << ", ";
4319 O << ", ";
4321 }
4322}
4323
4325 VPSlotTracker &SlotTracker) const {
4326 O << Indent << "EMIT ";
4328 O << " = EXPAND SCEV " << *Expr;
4329}
4330#endif
4331
4333 Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
4334 Type *STy = CanonicalIV->getType();
4335 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4336 ElementCount VF = State.VF;
4337 Value *VStart = VF.isScalar()
4338 ? CanonicalIV
4339 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
4340 Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
4341 if (VF.isVector()) {
4342 VStep = Builder.CreateVectorSplat(VF, VStep);
4343 VStep =
4344 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
4345 }
4346 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
4347 State.set(this, CanonicalVectorIV);
4348}
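// [Editorial example, not part of the original source] With a fixed VF of 4,
// a 64-bit canonical IV %index, and unroll part 0, the code above produces
// roughly:
//   %broadcast = ... splat of i64 %index to <4 x i64> ...
//   %vec.iv = add <4 x i64> %broadcast, <i64 0, i64 1, i64 2, i64 3>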
4349
4350#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4352 VPSlotTracker &SlotTracker) const {
4353 O << Indent << "EMIT ";
4355 O << " = WIDEN-CANONICAL-INDUCTION ";
4357}
4358#endif
4359
4361 auto &Builder = State.Builder;
4362 // Create a vector from the initial value.
4363 auto *VectorInit = getStartValue()->getLiveInIRValue();
4364
4365 Type *VecTy = State.VF.isScalar()
4366 ? VectorInit->getType()
4367 : VectorType::get(VectorInit->getType(), State.VF);
4368
4369 BasicBlock *VectorPH =
4370 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4371 if (State.VF.isVector()) {
4372 auto *IdxTy = Builder.getInt32Ty();
4373 auto *One = ConstantInt::get(IdxTy, 1);
4374 IRBuilder<>::InsertPointGuard Guard(Builder);
4375 Builder.SetInsertPoint(VectorPH->getTerminator());
4376 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4377 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4378 VectorInit = Builder.CreateInsertElement(
4379 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4380 }
4381
4382 // Create a phi node for the new recurrence.
4383 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4384 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4385 Phi->addIncoming(VectorInit, VectorPH);
4386 State.set(this, Phi);
4387}
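// [Editorial example, not part of the original source] For a fixed VF of 4
// and an i32 recurrence, the start value (hypothetical name
// %scalar.recur.init) is inserted into the last lane in the preheader and
// carried into the vector loop by a phi whose second incoming value is filled
// in later:
//   vector.ph:
//     %vector.recur.init = insertelement <4 x i32> poison, i32 %scalar.recur.init, i32 3
//   vector.body:
//     %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ ..., %vector.body ]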
4388
4391 VPCostContext &Ctx) const {
4392 if (VF.isScalar())
4393 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4394
4395 return 0;
4396}
4397
4398#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4400 VPSlotTracker &SlotTracker) const {
4401 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4403 O << " = phi ";
4405}
4406#endif
4407
4409 // Reductions do not have to start at zero. They can start with
4410 // any loop invariant values.
4411 VPValue *StartVPV = getStartValue();
4412
4413 // In order to support recurrences we need to be able to vectorize Phi nodes.
4414 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4415 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4416 // this value when we vectorize all of the instructions that use the PHI.
4417 BasicBlock *VectorPH =
4418 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4419 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
4420 Value *StartV = State.get(StartVPV, ScalarPHI);
4421 Type *VecTy = StartV->getType();
4422
4423 BasicBlock *HeaderBB = State.CFG.PrevBB;
4424 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4425 "recipe must be in the vector loop header");
4426 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4427 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4428 State.set(this, Phi, IsInLoop);
4429
4430 Phi->addIncoming(StartV, VectorPH);
4431}
4432
4433#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4435 VPSlotTracker &SlotTracker) const {
4436 O << Indent << "WIDEN-REDUCTION-PHI ";
4437
4439 O << " = phi ";
4441 if (VFScaleFactor != 1)
4442 O << " (VF scaled by 1/" << VFScaleFactor << ")";
4443}
4444#endif
4445
4447 Value *Op0 = State.get(getOperand(0));
4448 Type *VecTy = Op0->getType();
4449 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4450 State.set(this, VecPhi);
4451}
4452
4453#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4455 VPSlotTracker &SlotTracker) const {
4456 O << Indent << "WIDEN-PHI ";
4457
4459 O << " = phi ";
4461}
4462#endif
4463
4464// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
4465// remove VPActiveLaneMaskPHIRecipe.
4467 BasicBlock *VectorPH =
4468 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4469 Value *StartMask = State.get(getOperand(0));
4470 PHINode *Phi =
4471 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4472 Phi->addIncoming(StartMask, VectorPH);
4473 State.set(this, Phi);
4474}
4475
4476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4478 VPSlotTracker &SlotTracker) const {
4479 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4480
4482 O << " = phi ";
4484}
4485#endif
4486
4487#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4489 VPSlotTracker &SlotTracker) const {
4490 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
4491
4493 O << " = phi ";
4495}
4496#endif
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
#define I(x, y, z)
Definition MD5.cpp:58
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Instruction * createReverseEVL(IRBuilderBase &Builder, Value *Operand, Value *EVL, const Twine &Name)
Use all-true mask for reverse rather than actual mask, as it avoids a dependence w/o affecting the re...
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost for the intrinsic ID with Operands, produced by R.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Type * getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride, unsigned CurrentPart, IRBuilderBase &Builder)
SmallVector< Value *, 2 > VectorParts
static bool isUsedByLoadStoreAddress(const VPUser *V)
Returns true if V is used as part of the address of another load or store.
static void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
static Constant * getSignedIntOrFpConstant(Type *Ty, int64_t C)
A helper function that returns an integer or floating-point constant with value C.
static BranchInst * createCondBranch(Value *Cond, VPBasicBlock *VPBB, VPTransformState &State)
Create a conditional branch using Cond branching to the successors of VPBB.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
This file contains the declarations of the Vectorization Plan base classes:
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Conditional or Unconditional Branch instruction.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
static LLVM_ABI StringRef getPredicateName(Predicate P)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:325
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:313
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:310
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:321
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:271
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:661
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:594
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2579
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2633
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2567
LLVM_ABI Value * CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, const Twine &Name="")
Return a vector splice intrinsic if using scalable vectors, otherwise return a shufflevector.
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2626
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2645
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:562
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:567
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2336
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1725
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2466
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1808
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1134
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1420
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2085
LLVMContext & getContext() const
Definition IRBuilder.h:203
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1403
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:507
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2442
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1437
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
std::pair< MDNode *, MDNode * > getNoAliasMetadataFor(const Instruction *OrigInst) const
Returns a pair containing the alias_scope and noalias metadata nodes for OrigInst,...
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
static bool isSignedRecurrenceKind(RecurKind Kind)
Returns true if recurrece kind is a signed redux kind.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
value_op_iterator value_op_end()
Definition User.h:313
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
value_op_iterator value_op_begin()
Definition User.h:310
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3821
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:3874
iterator end()
Definition VPlan.h:3858
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:3887
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2438
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2433
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:203
VPlan * getPlan()
Definition VPlan.cpp:165
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:348
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
This class augments a recipe with a set of VPValues defined by the recipe.
Definition VPlanValue.h:302
void dump() const
Dump the VPDef to stderr (for debugging).
Definition VPlan.cpp:126
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:419
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:409
unsigned getVPDefID() const
Definition VPlanValue.h:429
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue() const
Definition VPlan.h:3698
VPValue * getStartValue() const
Definition VPlan.h:3697
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isSingleScalar() const
Returns true if the result of this VPExpressionRecipe is a single-scalar.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2021
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:1717
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Class to record and manage LLVM IR flags.
Definition VPlan.h:596
FastMathFlagsTy FMFs
Definition VPlan.h:660
bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
WrapFlagsTy WrapFlags
Definition VPlan.h:654
CmpInst::Predicate CmpPredicate
Definition VPlan.h:653
void printFlags(raw_ostream &O) const
GEPNoWrapFlags GEPFlags
Definition VPlan.h:658
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:818
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
TruncFlagsTy TruncFlags
Definition VPlan.h:655
CmpInst::Predicate getPredicate() const
Definition VPlan.h:800
ExactFlagsTy ExactFlags
Definition VPlan.h:657
bool hasNoSignedWrap() const
Definition VPlan.h:842
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:812
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:815
DisjointFlagsTy DisjointFlags
Definition VPlan.h:656
unsigned AllFlags
Definition VPlan.h:661
bool hasNoUnsignedWrap() const
Definition VPlan.h:831
NonNegFlagsTy NonNegFlags
Definition VPlan.h:659
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:763
Instruction & getInstruction() const
Definition VPlan.h:1382
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void extractLastLaneOfFirstOperand(VPBuilder &Builder)
Update the recipes first operand to the last lane of the operand using Builder.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1357
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetada object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void applyMetadata(Instruction &I) const
Add all metadata to I.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Definition VPlan.h:1101
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1060
@ ComputeAnyOfResult
Compute the final result of a AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1014
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1050
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1063
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1011
@ FirstOrderRecurrenceSplice
Definition VPlan.h:982
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1054
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1006
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1003
@ VScale
Returns the value for vscale.
Definition VPlan.h:1065
@ CanonicalIVIncrementForPart
Definition VPlan.h:996
@ CalculateTripCountMinusVF
Definition VPlan.h:994
bool hasResult() const
Definition VPlan.h:1140
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1180
unsigned getOpcode() const
Definition VPlan.h:1120
bool onlyFirstPartUsed(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
bool onlyFirstLaneUsed(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
void execute(VPTransformState &State) override
Generate the instruction.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:2548
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:2552
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2550
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2542
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2571
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:2536
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2645
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2664
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:2615
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
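A minimal sketch of the VPLane helpers above (assumes the LLVM-internal VPlanHelpers.h declarations and that Offset in getLaneFromEnd counts back from VF; not from this file):
static void demoLanes(ElementCount VF) {
  VPLane First = VPLane::getFirstLane();               // lane 0
  VPLane Last = VPLane::getLastLaneForVF(VF);          // lane VF - 1
  VPLane SecondToLast = VPLane::getLaneFromEnd(VF, 2); // lane VF - 2
  (void)First; (void)Last; (void)SecondToLast;
}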
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPPartialReductionRecipe.
unsigned getOpcode() const
Get the binary op's opcode.
Definition VPlan.h:2815
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1272
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:3965
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1297
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1264
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
VPRegionBlock * getRegion()
Definition VPlan.h:4126
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
bool isScalarCast() const
Return true if the recipe is a scalar cast.
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:397
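For orientation, a hypothetical helper (not from this file) showing how the VPRecipeBase movement and memory-effect queries above compose:
static bool hoistBefore(VPRecipeBase &R, VPRecipeBase &InsertPt) {
  if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.mayWriteToMemory())
    return false;            // keep recipes with memory effects in place
  R.removeFromParent();      // unlink from the containing VPBasicBlock
  R.insertBefore(&InsertPt); // re-insert immediately before InsertPt
  return true;
}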
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:2860
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:2754
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:2758
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:2760
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:2750
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:2756
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4009
bool isReplicator() const
Indicates whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4077
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2875
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:2920
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
unsigned getOpcode() const
Definition VPlan.h:2949
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStepValue() const
Definition VPlan.h:3763
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:519
This class can be used to assign names to VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
Helper to access the operand that contains the unroll part for this recipe after unrolling.
Definition VPlan.h:926
VPValue * getUnrollPartOperand(const VPUser &U) const
Return the VPValue operand containing the unroll part or null if there is no such operand.
unsigned getUnrollPart(const VPUser &U) const
Return the unroll part.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1436
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
unsigned getNumOperands() const
Definition VPlanValue.h:237
operand_iterator op_begin()
Definition VPlanValue.h:263
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
virtual bool onlyFirstLaneUsed(const VPValue *Op) const
Returns true if the VPUser only uses the first lane of operand Op.
Definition VPlanValue.h:282
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1390
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:135
friend class VPExpressionRecipe
Definition VPlanValue.h:53
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1432
bool hasMoreThanOneUniqueUser() const
Returns true if the value has more than one unique user.
Definition VPlanValue.h:140
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
VPValue(const unsigned char SC, Value *UV=nullptr, VPDef *Def=nullptr)
Definition VPlan.cpp:98
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1393
user_iterator user_begin()
Definition VPlanValue.h:130
unsigned getNumUsers() const
Definition VPlanValue.h:113
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:171
user_range users()
Definition VPlanValue.h:134
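Illustrative sketch only: walking the def-use edges with the VPUser/VPValue API listed above (names are placeholders):
static bool isOnlyUsedBy(const VPValue &Def, const VPUser &U) {
  if (Def.hasMoreThanOneUniqueUser())
    return false;
  for (unsigned I = 0, E = U.getNumOperands(); I != E; ++I)
    if (U.getOperand(I) == &Def) // operands are the inverse def-use edges
      return true;
  return false;
}
static void replaceValue(VPValue &Old, VPValue &New) {
  Old.replaceAllUsesWith(&New); // rewires every user of Old to use New instead
}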
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:1921
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
operand_range args()
Definition VPlan.h:1674
Function * getCalledScalarFunction() const
Definition VPlan.h:1670
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
void execute(VPTransformState &State) override
Generate a canonical vector induction variable of the vector loop, with start = {<Part*VF,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Returns the result type of the cast.
Definition VPlan.h:1543
void execute(VPTransformState &State) override
Produce widened copies of the cast.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:1818
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2077
TruncInst * getTruncInst()
Returns the first defined value as a TruncInst if it is one, or nullptr otherwise.
Definition VPlan.h:2188
Type * getScalarType() const
Returns the scalar type of the induction.
Definition VPlan.h:2197
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool onlyFirstLaneUsed(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:1608
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getResultType() const
Return the scalar return type of the intrinsic.
Definition VPlan.h:1611
void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3195
bool Reverse
Whether the consecutive accessed addresses are in reverse order.
Definition VPlan.h:3192
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3235
Instruction & Ingredient
Definition VPlan.h:3183
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3189
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3249
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3186
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3242
bool isReverse() const
Return whether the consecutive loaded/stored addresses are in reverse order.
Definition VPlan.h:3239
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1446
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getUF() const
Definition VPlan.h:4365
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1027
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:382
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1091
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:838
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:314
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:169
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:253
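A short sketch of how the ElementCount queries above distinguish fixed and scalable vectorization factors (illustrative only):
static unsigned minLanes(ElementCount VF) {
  // 4 for <4 x i32>; 4 for <vscale x 4 x i32>, where the real lane count is
  // additionally scaled by vscale at run time. VF.isScalable() tells the two
  // apart, and VF.getFixedValue() is only valid for the non-scalable case.
  return VF.getKnownMinValue();
}
static ElementCount halfVF(ElementCount VF) {
  return VF.divideCoefficientBy(2); // e.g. <vscale x 4 x ...> -> <vscale x 2 x ...>
}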
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
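Sketch (assumes the VPlanPatternMatch matchers listed above, which this file uses): recognizing a multiply-by-constant VPValue.
static bool isMulByConst(VPValue *V, const APInt *&C) {
  using namespace llvm::VPlanPatternMatch;
  // Matches "mul x, C" for any VPValue x and integer constant C, binding C.
  return match(V, m_Mul(m_VPValue(), m_APInt(C)));
}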
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * createFindLastIVReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind, Value *Start, Value *Sentinel)
Create a reduction of the given vector Src for a reduction of the kind RecurKind::FindLastIV.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2231
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1734
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
@ Other
Any other memory.
Definition ModRef.h:68
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
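To illustrate the mask helpers above, a sketch with VF = 4 and an interleave factor of 2 (values chosen for illustration only):
static void demoMasks() {
  // Member 0 of a factor-2 interleave group: <0, 2, 4, 6>.
  SmallVector<int, 16> Strided = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Re-interleave two concatenated 4-lane vectors: <0, 4, 1, 5, 2, 6, 3, 7>.
  SmallVector<int, 16> Interleaved = createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  // Replicate each of 4 lanes twice: <0, 0, 1, 1, 2, 2, 3, 3>.
  SmallVector<int, 16> Replicated = createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
  (void)Strided; (void)Interleaved; (void)Replicated;
}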
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
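Sketch of materializing VF-dependent values with getRuntimeVF and createStepForVF above; for a fixed VF both fold to constants, for a scalable VF they read vscale:
static Value *twoTimesVF(IRBuilderBase &B, Type *IdxTy, ElementCount VF) {
  Value *RuntimeVF = getRuntimeVF(B, IdxTy, VF);    // VF as an IR value
  (void)RuntimeVF;
  return createStepForVF(B, IdxTy, VF, /*Step=*/2); // 2 * VF
}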
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
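Sketch of the reduction helpers above (RecurKind::Add and RecurKind::SMax chosen for illustration; not from this file):
static Value *reduceAdd(IRBuilderBase &B, Value *VecSrc) {
  // Unordered horizontal sum over all lanes of VecSrc.
  return createSimpleReduction(B, VecSrc, RecurKind::Add);
}
static Value *signedMax(IRBuilderBase &Builder, Value *L, Value *R) {
  // Emits IR computing the signed maximum of L and R (cmp+select or smax).
  return createMinMaxOp(Builder, RecurKind::SMax, L, R);
}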
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind)
A helper function that returns how much we should divide the cost of a predicated block by.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI Value * createAnyOfReduction(IRBuilderBase &B, Value *Src, Value *InitVal, PHINode *OrigPhi)
Create a reduction of the given vector Src for a reduction of kind RecurKind::AnyOf.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes, enabling convenient use of cast/dyn_cast/isa and exec...
Definition VPlan.h:1419
PHINode & getIRPhi()
Definition VPlan.h:1427
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
A pure-virtual common base class for recipes defining a single VPValue and using IR flags.
Definition VPlan.h:871
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:872
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:267
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
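For context, a simplified sketch of the shape a recipe's execute() takes with VPTransformState; the helper is hypothetical and State.set() is assumed from the wider VPlan API (it is not listed above):
static void widenAdd(VPSingleDefRecipe &R, VPTransformState &State) {
  Value *A = State.get(R.getOperand(0)); // widened (vector) operand 0
  Value *B = State.get(R.getOperand(1)); // widened (vector) operand 1
  Value *Sum = State.Builder.CreateAdd(A, B, "vec.add");
  State.set(&R, Sum);                    // assumed: record the produced value for users
}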
void execute(VPTransformState &State) override
Generate the wide load or gather.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3325
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isInvariantCond() const
Definition VPlan.h:1763
VPValue * getCond() const
Definition VPlan.h:1759
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenSelectRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the select instruction.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3406
void execute(VPTransformState &State) override
Generate the wide store or scatter.
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3409
void execute(VPTransformState &State) override
Generate a wide store or scatter.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3370
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.