LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Twine.h"
28#include "llvm/IR/BasicBlock.h"
29#include "llvm/IR/IRBuilder.h"
30#include "llvm/IR/Instruction.h"
32#include "llvm/IR/Intrinsics.h"
33#include "llvm/IR/Type.h"
34#include "llvm/IR/Value.h"
37#include "llvm/Support/Debug.h"
41#include <cassert>
42
43using namespace llvm;
44using namespace llvm::VPlanPatternMatch;
45
47
48#define LV_NAME "loop-vectorize"
49#define DEBUG_TYPE LV_NAME
50
// NOTE(review): the signature line for this body is absent from the listing.
// Judging by the assertion message and the mayWriteToMemory() calls below,
// this switch presumably answers "may this recipe write to memory?" --
// confirm against the original source.
//
// Dispatches on the recipe ID. Some kinds forward the question to the concrete
// recipe or to the underlying IR instruction, some return a fixed answer, and
// any kind that is not listed is reported as possibly writing.
52 switch (getVPRecipeID()) {
53 case VPExpressionSC:
54 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
55 case VPInstructionSC: {
56 auto *VPI = cast<VPInstruction>(this);
57 // Loads read from memory but don't write to memory.
58 if (VPI->getOpcode() == Instruction::Load)
59 return false;
60 return VPI->opcodeMayReadOrWriteFromMemory();
61 }
// An interleave group counts as writing only if it has store operands.
62 case VPInterleaveEVLSC:
63 case VPInterleaveSC:
64 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
65 case VPWidenStoreEVLSC:
66 case VPWidenStoreSC:
67 return true;
// Replicated instructions defer to the underlying IR instruction.
68 case VPReplicateSC:
69 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
70 ->mayWriteToMemory();
// A widened call may write unless the scalar callee only reads memory.
71 case VPWidenCallSC:
72 return !cast<VPWidenCallRecipe>(this)
73 ->getCalledScalarFunction()
74 ->onlyReadsMemory();
75 case VPWidenMemIntrinsicSC:
76 case VPWidenIntrinsicSC:
77 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
78 case VPActiveLaneMaskPHISC:
79 case VPCurrentIterationPHISC:
80 case VPBranchOnMaskSC:
81 case VPDerivedIVSC:
82 case VPFirstOrderRecurrencePHISC:
83 case VPReductionPHISC:
84 case VPScalarIVStepsSC:
85 case VPPredInstPHISC:
86 return false;
// The kinds below also return false; in assert-enabled builds the underlying
// IR instruction (when there is one) is checked to agree with that answer.
87 case VPBlendSC:
88 case VPReductionEVLSC:
89 case VPReductionSC:
90 case VPVectorPointerSC:
91 case VPWidenCanonicalIVSC:
92 case VPWidenCastSC:
93 case VPWidenGEPSC:
94 case VPWidenIntOrFpInductionSC:
95 case VPWidenLoadEVLSC:
96 case VPWidenLoadSC:
97 case VPWidenPHISC:
98 case VPWidenPointerInductionSC:
99 case VPWidenSC: {
100 const Instruction *I =
101 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
// The cast to void keeps I "used" when the assert below is compiled out.
102 (void)I;
103 assert((!I || !I->mayWriteToMemory()) &&
104 "underlying instruction may write to memory");
105 return false;
106 }
// Any recipe kind not handled above is treated as possibly writing.
107 default:
108 return true;
109 }
110}
111
// NOTE(review): the signature line for this body is absent from the listing.
// Judging by the assertion message and the mayReadFromMemory() calls below,
// this switch presumably answers "may this recipe read from memory?" --
// confirm against the original source.
//
// Dispatches on the recipe ID; any kind that is not listed is reported as
// possibly reading.
113 switch (getVPRecipeID()) {
114 case VPExpressionSC:
115 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
116 case VPInstructionSC:
117 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
118 case VPWidenLoadEVLSC:
119 case VPWidenLoadSC:
120 return true;
// Replicated instructions defer to the underlying IR instruction.
121 case VPReplicateSC:
122 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
123 ->mayReadFromMemory();
// A widened call may read unless the scalar callee only writes memory.
124 case VPWidenCallSC:
125 return !cast<VPWidenCallRecipe>(this)
126 ->getCalledScalarFunction()
127 ->onlyWritesMemory();
128 case VPWidenMemIntrinsicSC:
129 case VPWidenIntrinsicSC:
130 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
// Widened stores are listed here: they are reported as not reading.
131 case VPBranchOnMaskSC:
132 case VPDerivedIVSC:
133 case VPCurrentIterationPHISC:
134 case VPFirstOrderRecurrencePHISC:
135 case VPReductionPHISC:
136 case VPPredInstPHISC:
137 case VPScalarIVStepsSC:
138 case VPWidenStoreEVLSC:
139 case VPWidenStoreSC:
140 return false;
// The kinds below also return false; in assert-enabled builds the underlying
// IR instruction (when there is one) is checked to agree with that answer.
141 case VPBlendSC:
142 case VPReductionEVLSC:
143 case VPReductionSC:
144 case VPVectorPointerSC:
145 case VPWidenCanonicalIVSC:
146 case VPWidenCastSC:
147 case VPWidenGEPSC:
148 case VPWidenIntOrFpInductionSC:
149 case VPWidenPHISC:
150 case VPWidenPointerInductionSC:
151 case VPWidenSC: {
152 const Instruction *I =
153 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
// The cast to void keeps I "used" when the assert below is compiled out.
154 (void)I;
155 assert((!I || !I->mayReadFromMemory()) &&
156 "underlying instruction may read from memory");
157 return false;
158 }
// Interleave recipes have no case of their own here, so they reach this
// default and are reported as possibly reading (see the FIXME).
159 default:
160 // FIXME: Return false if the recipe represents an interleaved store.
161 return true;
162 }
163}
164
// NOTE(review): the signature line for this body is absent from the listing.
// Judging by the assertion messages and the mayHaveSideEffects() calls below,
// this switch presumably answers "may this recipe have side effects?" --
// confirm against the original source.
//
// Dispatches on the recipe ID; any kind that is not listed is reported as
// possibly having side effects.
166 switch (getVPRecipeID()) {
167 case VPExpressionSC:
168 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
169 case VPActiveLaneMaskPHISC:
170 case VPDerivedIVSC:
171 case VPCurrentIterationPHISC:
172 case VPFirstOrderRecurrencePHISC:
173 case VPReductionPHISC:
174 case VPPredInstPHISC:
175 case VPVectorEndPointerSC:
176 return false;
// A VPInstruction has side effects if it may write to memory or is one of the
// three branch opcodes checked below.
177 case VPInstructionSC: {
178 auto *VPI = cast<VPInstruction>(this);
179 return mayWriteToMemory() ||
180 VPI->getOpcode() == VPInstruction::BranchOnCount ||
181 VPI->getOpcode() == VPInstruction::BranchOnCond ||
182 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
183 }
// A widened call has side effects if it may write, may throw, or is not known
// to return.
184 case VPWidenCallSC: {
185 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
186 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
187 }
188 case VPWidenMemIntrinsicSC:
189 case VPWidenIntrinsicSC:
190 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
// The kinds below return false; in assert-enabled builds the underlying IR
// instruction (when there is one) is checked to agree with that answer.
191 case VPBlendSC:
192 case VPReductionEVLSC:
193 case VPReductionSC:
194 case VPScalarIVStepsSC:
195 case VPVectorPointerSC:
196 case VPWidenCanonicalIVSC:
197 case VPWidenCastSC:
198 case VPWidenGEPSC:
199 case VPWidenIntOrFpInductionSC:
200 case VPWidenPHISC:
201 case VPWidenPointerInductionSC:
202 case VPWidenSC: {
203 const Instruction *I =
204 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
// The cast to void keeps I "used" when the assert below is compiled out.
205 (void)I;
206 assert((!I || !I->mayHaveSideEffects()) &&
207 "underlying instruction has side-effects");
208 return false;
209 }
210 case VPInterleaveEVLSC:
211 case VPInterleaveSC:
212 return mayWriteToMemory();
// Widened loads and stores: side effects are equated with writing to memory.
// NOTE(review): listing line 219, which holds the right-hand side of the
// comparison inside the assert, is missing from this view.
213 case VPWidenLoadEVLSC:
214 case VPWidenLoadSC:
215 case VPWidenStoreEVLSC:
216 case VPWidenStoreSC:
217 assert(
218 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
220 "mayHaveSideffects result for ingredient differs from this "
221 "implementation");
222 return mayWriteToMemory();
223 case VPReplicateSC: {
224 auto *R = cast<VPReplicateRecipe>(this);
225 return R->getUnderlyingInstr()->mayHaveSideEffects();
226 }
// Any recipe kind not handled above is treated as possibly having side effects.
227 default:
228 return true;
229 }
230}
231
// NOTE(review): the signature line for this body is absent from the listing,
// so the name and intent of this predicate cannot be read from here.
//
// What the code does: returns true only for VPInstruction recipes whose opcode
// is a cast, Add, Sub, Mul or GetElementPtr. Every other recipe kind and every
// other opcode yields false.
233 switch (getVPRecipeID()) {
234 default:
235 return false;
236 case VPInstructionSC: {
237 unsigned Opcode = cast<VPInstruction>(this)->getOpcode();
// All cast opcodes are accepted before the explicit opcode list is consulted.
238 if (Instruction::isCast(Opcode))
239 return true;
240
241 switch (Opcode) {
242 default:
243 return false;
244 case Instruction::Add:
245 case Instruction::Sub:
246 case Instruction::Mul:
247 case Instruction::GetElementPtr:
248 return true;
249 }
250 }
251 }
252}
253
// NOTE(review): the signature line is absent from the listing; this looks like
// the overload that takes a recipe pointer named InsertPos -- confirm.
//
// Inserts this recipe into InsertPos's parent block at InsertPos's iterator.
// Asserts that this recipe has no parent yet and that InsertPos is in a block.
255 assert(!Parent && "Recipe already in some VPBasicBlock");
256 assert(InsertPos->getParent() &&
257 "Insertion position not in any VPBasicBlock");
258 InsertPos->getParent()->insert(this, InsertPos->getIterator());
259}
260
// Inserts this recipe into block BB at position I.
// NOTE(review): listing line 262, which declares the second parameter (used
// below as I), is missing from this view.
// Asserts that this recipe has no parent yet and that I is either BB.end() or
// refers to a recipe whose parent is BB.
261void VPRecipeBase::insertBefore(VPBasicBlock &BB,
263 assert(!Parent && "Recipe already in some VPBasicBlock");
264 assert(I == BB.end() || I->getParent() == &BB);
265 BB.insert(this, I);
266}
267
// NOTE(review): the signature line is absent from the listing; presumably this
// is the insert-after counterpart taking a recipe pointer named InsertPos --
// confirm.
//
// Inserts this recipe into InsertPos's parent block at the position one past
// InsertPos. Asserts that this recipe has no parent yet and that InsertPos is
// in a block.
269 assert(!Parent && "Recipe already in some VPBasicBlock");
270 assert(InsertPos->getParent() &&
271 "Insertion position not in any VPBasicBlock");
272 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
273}
274
// NOTE(review): the signature line and listing line 277 are absent from this
// view, so the statement that detaches the recipe from its block is not
// visible. What is visible: the recipe must currently be in a block, and its
// Parent pointer is cleared at the end.
276 assert(getParent() && "Recipe not in any VPBasicBlock");
278 Parent = nullptr;
279}
280
// NOTE(review): the signature line and listing line 283 (the only statement
// after the assert) are absent from this view. What is visible: the recipe
// must currently be in a block.
282 assert(getParent() && "Recipe not in any VPBasicBlock");
284}
285
// NOTE(review): the signature line and listing line 287 are absent from this
// view. What is visible: the body ends by inserting this recipe after
// InsertPos.
288 insertAfter(InsertPos);
289}
290
296
// NOTE(review): the signature line is absent from the listing. The body uses
// a vectorization factor VF and a cost context Ctx and returns an
// InstructionCost.
//
// Wraps computeCost(): the cost is zero when the context says to skip the
// underlying instruction, and the computed cost may be overridden when the
// ForceTargetInstructionCost option was given.
298 // Get the underlying instruction for the recipe, if there is one. It is used
299 // to
300 // * decide if cost computation should be skipped for this recipe,
301 // * apply forced target instruction cost.
302 Instruction *UI = nullptr;
303 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
304 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
305 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
306 UI = IG->getInsertPos();
307 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
308 UI = &WidenMem->getIngredient();
309
310 InstructionCost RecipeCost;
311 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
312 RecipeCost = 0;
313 } else {
314 RecipeCost = computeCost(VF, Ctx);
// The override is applied only if the option occurred on the command line and
// the computed cost is valid.
// NOTE(review): listing line 318, the statement executed when UI is non-null,
// is missing from this view. Without an underlying instruction the cost is
// forced to zero.
315 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
316 RecipeCost.isValid()) {
317 if (UI)
319 else
320 RecipeCost = InstructionCost(0);
321 }
322 }
323
// Debug-only trace of the final cost followed by a dump of the recipe.
324 LLVM_DEBUG({
325 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
326 dump();
327 });
328 return RecipeCost;
329}
330
// NOTE(review): the first line of this signature is absent from the listing.
// Fallback implementation that must never be reached; per the message below,
// subclasses are expected to provide their own computeCost.
332 VPCostContext &Ctx) const {
333 llvm_unreachable("subclasses should implement computeCost");
334}
335
// NOTE(review): the signature line and listing line 338 (the right-hand
// operand of the trailing ||) are absent from this view. The visible part
// tests whether the recipe ID lies in the range [VPFirstPHISC, VPLastPHISC].
337 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
339}
340
// NOTE(review): the signature line is absent from the listing; the body takes
// another flags object named Other.
//
// Narrows this object's flags to those also set in Other by AND-ing the
// matching bits in place. Both objects must have the same OpType. Properties
// that are not droppable flags (compare predicate, reduction kind / ordered /
// in-loop) are only asserted to be equal.
342 assert(OpType == Other.OpType && "OpType must match");
343 switch (OpType) {
344 case OperationType::OverflowingBinOp:
345 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
346 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
347 break;
348 case OperationType::Trunc:
349 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
350 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
351 break;
352 case OperationType::DisjointOp:
353 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
354 break;
355 case OperationType::PossiblyExactOp:
356 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
357 break;
358 case OperationType::GEPOp:
359 GEPFlagsStorage &= Other.GEPFlagsStorage;
360 break;
// FPMathOp and FCmp share this arm; the predicate check applies to FCmp only.
// Only NoNaNs and NoInfs are intersected here; the remaining fast-math bits
// are left untouched by this code.
361 case OperationType::FPMathOp:
362 case OperationType::FCmp:
363 assert((OpType != OperationType::FCmp ||
364 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
365 "Cannot drop CmpPredicate");
366 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
367 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
368 break;
369 case OperationType::NonNegOp:
370 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
371 break;
372 case OperationType::Cmp:
373 assert(CmpPredStorage == Other.CmpPredStorage &&
374 "Cannot drop CmpPredicate");
375 break;
// Reductions: kind, ordered-ness and in-loop-ness must match exactly; as for
// FP ops above, only NoNaNs and NoInfs are intersected.
376 case OperationType::ReductionOp:
377 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
378 "Cannot change RecurKind");
379 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
380 "Cannot change IsOrdered");
381 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
382 "Cannot change IsInLoop");
383 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
384 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
385 break;
386 case OperationType::Other:
387 break;
388 }
389}
390
// NOTE(review): the signature line is absent from the listing; the name is
// inferred from the call in the Select arm of VPInstruction::generate.
//
// Converts the stored fast-math bits into an llvm::FastMathFlags value.
// Valid only for the FPMathOp, FCmp, ReductionOp and Other operation types.
// For Other a default-constructed FastMathFlags is returned.
392 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
393 OpType == OperationType::ReductionOp ||
394 OpType == OperationType::Other) &&
395 "recipe doesn't have fast math flags");
396 if (OpType == OperationType::Other)
397 return FastMathFlags();
// Copy each stored bit into the result one by one.
398 const FastMathFlagsTy &F = getFMFsRef();
399 FastMathFlags Res;
400 Res.setAllowReassoc(F.AllowReassoc);
401 Res.setNoNaNs(F.NoNaNs);
402 Res.setNoInfs(F.NoInfs);
403 Res.setNoSignedZeros(F.NoSignedZeros);
404 Res.setAllowReciprocal(F.AllowReciprocal);
405 Res.setAllowContract(F.AllowContract);
406 Res.setApproxFunc(F.ApproxFunc);
407 return Res;
408}
409
410#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
412
// Prints this recipe to O: first the recipe itself via printRecipe(), then
// ", !dbg <location>" when the recipe has a debug location.
// NOTE(review): listing line 422, the statement executed when the recipe is
// also a VPIRMetadata, is missing from this view.
413void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
414 VPSlotTracker &SlotTracker) const {
415 printRecipe(O, Indent, SlotTracker);
416 if (auto DL = getDebugLoc()) {
417 O << ", !dbg ";
418 DL.print(O);
419 }
420
421 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
423}
424#endif
425
// NOTE(review): the signature line is absent from the listing; the name is
// inferred from the getScalarTypeOrInfer(...) calls further down, and the
// body takes a VPValue named V -- confirm.
//
// Returns V's scalar type if it already has one. Otherwise V must be defined
// by a recipe that is placed in a block, and the type is inferred with a
// VPTypeAnalysis built for that block's plan.
427 if (Type *Ty = V->getScalarType())
428 return Ty;
429 auto *Recipe = V->getDefiningRecipe();
430 assert(Recipe && Recipe->getParent() &&
431 "operand without scalar type must be a recipe in a plan");
432 VPTypeAnalysis TypeInfo(*Recipe->getParent()->getPlan());
433 return TypeInfo.inferScalarType(V);
434}
435
// NOTE(review): the constructor's signature line is absent from the listing.
// The initializer list builds the base with recipe ID VPExpandSCEVSC, no
// operands and Expr's type, then stores Expr.
437 : VPSingleDefRecipe(VPRecipeBase::VPExpandSCEVSC, {}, Expr->getType()),
438 Expr(Expr) {}
439
440/// For call VPInstruction operands, return the operand index of the called
441/// function. The function is either the last operand (for unmasked calls) or
442/// the second-to-last operand (for masked calls).
// NOTE(review): the signature line (443) and the head of the assert (448) are
// absent from this listing; only the assert's message string is visible.
// Operands[NumOps - 1] is read unconditionally, so the operand list is
// presumably required to be non-empty -- confirm against callers.
444 unsigned NumOps = Operands.size();
445 auto *LastOp = dyn_cast<VPIRValue>(Operands[NumOps - 1]);
446 if (LastOp && isa<Function>(LastOp->getValue()))
447 return NumOps - 1;
449 "expected function operand");
450 return NumOps - 2;
451}
452
453/// For call VPInstruction operands, return the called function.
// NOTE(review): the signature line (454) is absent from this listing.
// Looks up the callee's operand index, then casts that operand's IR value to
// a Function.
455 unsigned Idx = getCalledFnOperandIndex(Operands);
456 return cast<Function>(cast<VPIRValue>(Operands[Idx])->getValue());
457}
458
// NOTE(review): the first signature line (459) is absent from this listing;
// the name is taken from the calls further down, which pass (Opcode, Operands).
//
// Derives the scalar result type of an instruction from its opcode and
// operand types. Operands must be non-empty. Opcodes without a dedicated arm
// fall through to the end and yield the scalar type of operand 0.
// NOTE(review): several case labels (listing lines 476-478, 484, 487-489,
// 493, 500, 514-517) are missing, so the opcode behind some arms cannot be
// named from here.
460 ArrayRef<VPValue *> Operands) {
461 assert(!Operands.empty() &&
462 "zero-operand VPInstruction opcodes must pass explicit ResultTy");
463 // Assert operand \p Idx (if present and typed) has type \p ExpectedTy.
464 [[maybe_unused]] auto AssertOperandType = [&Operands](unsigned Idx,
465 Type *ExpectedTy) {
466 if (!ExpectedTy || Operands.size() <= Idx)
467 return;
468 [[maybe_unused]] Type *OpTy = getScalarTypeOrInfer(Operands[Idx]);
469 assert((!OpTy || OpTy == ExpectedTy) &&
470 "different types inferred for different operands");
471 };
472
// Operand 0's type supplies the LLVMContext used to build fixed result types.
473 Type *Op0Ty = getScalarTypeOrInfer(Operands[0]);
474 LLVMContext &Ctx = Op0Ty->getContext();
475 switch (Opcode) {
// Opcodes that produce no value.
479 case Instruction::Store:
480 case Instruction::Switch:
481 return Type::getVoidTy(Ctx);
// Comparisons yield i1; both compared operands must share a type.
482 case Instruction::ICmp:
483 case Instruction::FCmp:
485 AssertOperandType(1, Op0Ty);
486 return IntegerType::get(Ctx, 1);
// NOTE(review): case label(s) missing. This arm expects i1 operands and
// yields i1.
490 assert((!Op0Ty || Op0Ty->isIntegerTy(1)) && "expected bool operand");
491 AssertOperandType(1, Op0Ty);
492 return IntegerType::get(Ctx, 1);
// NOTE(review): case label missing. This arm always yields i32.
494 return IntegerType::get(Ctx, 32);
// Select takes its type from the two selected values (operands 1 and 2).
495 case Instruction::Select: {
496 Type *Op1Ty = getScalarTypeOrInfer(Operands[1]);
497 AssertOperandType(2, Op1Ty);
498 return Op1Ty;
499 }
// NOTE(review): case label missing; per the assert message this is the
// ExtractLane arm. The result type is that of the source operands (index 1
// onward), which must all agree.
501 assert(Operands.size() >= 2 && "ExtractLane requires a lane operand and "
502 "at least one source vector operand");
503 Type *Op1Ty = getScalarTypeOrInfer(Operands[1]);
504 for (unsigned Idx = 2; Idx != Operands.size(); ++Idx)
505 AssertOperandType(Idx, Op1Ty);
506 return Op1Ty;
507 }
// Single-level extractvalue: operand 1 is a constant index into operand 0's
// struct type.
508 case Instruction::ExtractValue: {
509 assert(Operands.size() == 2 && "expected single level extractvalue");
510 auto *StructTy = cast<StructType>(Op0Ty);
511 return StructTy->getTypeAtIndex(
512 cast<VPConstantInt>(Operands[1])->getZExtValue());
513 }
// The result type of these opcodes cannot be derived from the operands.
518 case Instruction::Load:
519 case Instruction::Alloca:
520 llvm_unreachable("type must be passed explicitly");
521 case Instruction::Call:
522 return getCalledFunction(Operands)->getReturnType();
523 default:
524 break;
525 }
526
527 // Opcodes that require all operands to share the same scalar type as the
528 // result.
// NOTE(review): listing lines 531-535, which hold the rest of this condition,
// are missing from this view.
529 bool AllOperandsSameType =
530 Instruction::isBinaryOp(Opcode) ||
536 Opcode);
537 if (AllOperandsSameType)
538 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
539 AssertOperandType(Idx, Op0Ty);
540
541 return Op0Ty;
542}
543
// NOTE(review): the first signature line (544) is absent from this listing;
// the body takes an IR instruction I and an operand list.
//
// For casts, ExtractValue, Load and Alloca the result type is taken directly
// from the IR instruction. Every other opcode is delegated to the
// opcode-based computeScalarTypeForInstruction.
545 ArrayRef<VPValue *> Operands) {
546 unsigned Opcode = I->getOpcode();
547 if (Instruction::isCast(Opcode) ||
548 is_contained(ArrayRef<unsigned>({Instruction::ExtractValue,
549 Instruction::Load, Instruction::Alloca}),
550 Opcode))
551 return I->getType();
552 return computeScalarTypeForInstruction(Opcode, Operands);
553}
554
// NOTE(review): this is the tail of a VPInstruction constructor. The first
// signature line (555), the start of the base initializer (558) and the heads
// of the three asserts in the body (564, 566, 568-570) are absent from this
// listing; only their message strings are visible.
//
// The scalar result type is ResultTy when the caller passes one, otherwise it
// is computed from Opcode and Operands.
556 const VPIRFlags &Flags, const VPIRMetadata &MD,
557 DebugLoc DL, const Twine &Name, Type *ResultTy)
559 VPRecipeBase::VPInstructionSC, Operands,
560 ResultTy ? ResultTy
561 : computeScalarTypeForInstruction(Opcode, Operands),
562 Flags, DL),
563 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
565 "Set flags not supported for the provided opcode");
567 "Opcode requires specific flags to be set");
571 "number of operands does not match opcode");
572}
573
// NOTE(review): the signature line (574) is absent from this listing, so the
// function name is not visible here.
//
// Maps an opcode to the number of operands it takes: 1 for unary and cast
// opcodes, 2 for binary opcodes, otherwise the fixed count from the switch.
// Opcodes whose operand count is not fixed yield -1u.
// NOTE(review): many case labels are missing (listing lines 582-584, 590-602,
// 608-616, 620-621, 629-637), as is the head of the Call arm's return (624).
575 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
576 return 1;
577
578 if (Instruction::isBinaryOp(Opcode))
579 return 2;
580
581 switch (Opcode) {
585 return 0;
586 case Instruction::Alloca:
587 case Instruction::ExtractValue:
588 case Instruction::Freeze:
589 case Instruction::Load:
603 return 1;
604 case Instruction::ICmp:
605 case Instruction::FCmp:
606 case Instruction::ExtractElement:
607 case Instruction::Store:
617 return 2;
618 case Instruction::InsertElement:
619 case Instruction::Select:
622 return 3;
623 case Instruction::Call:
625 1;
626 case Instruction::GetElementPtr:
627 case Instruction::PHI:
628 case Instruction::Switch:
638 // Cannot determine the number of operands from the opcode.
639 return -1u;
640 }
641 llvm_unreachable("all cases should be handled above");
642}
643
647
// Reports whether this VPInstruction's opcode is one for which a scalar value
// for the first lane can be generated (per the function name).
// NOTE(review): the two conditions guarding the early returns (listing lines
// 649 and 651) and several case labels (658-666) are missing from this view.
// Opcodes not listed in the switch yield false.
648bool VPInstruction::canGenerateScalarForFirstLane() const {
650 return true;
652 return true;
653 switch (Opcode) {
654 case Instruction::Freeze:
655 case Instruction::ICmp:
656 case Instruction::PHI:
657 case Instruction::Select:
667 return true;
668 default:
669 return false;
670 }
671}
672
// NOTE(review): the signature line (673) is absent from this listing, so the
// function name is not visible here.
//
// Maps a subtracting recurrence kind to the matching add opcode: Sub -> Add,
// FSub -> FAdd. Any other kind is a programming error.
674 if (Kind == RecurKind::Sub)
675 return Instruction::Add;
676 if (Kind == RecurKind::FSub)
677 return Instruction::FAdd;
678 llvm_unreachable("RecurKind should be Sub/FSub.");
679}
680
// Emits the IR for this VPInstruction through State.Builder and returns the
// resulting value. Operand values are fetched with State.get(), either as
// whole values, as scalars, or for a specific lane.
// NOTE(review): many lines are missing from this listing, including most of
// the `case VPInstruction::...` labels. The NOTE comments below describe only
// what the visible statements of each arm do; the opcode behind an unlabeled
// arm must be confirmed against the original source.
681Value *VPInstruction::generate(VPTransformState &State) {
682 IRBuilderBase &Builder = State.Builder;
683
// NOTE(review): the guarding condition (listing line 684) is missing. This
// arm creates a binary operator from operands 0 and 1 and, when the result is
// an Instruction, applies this recipe's IR flags to it.
685 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
686 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
687 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
688 auto *Res =
689 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
690 if (auto *I = dyn_cast<Instruction>(Res))
691 applyFlags(*I);
692 return Res;
693 }
694
695 switch (getOpcode()) {
696 case VPInstruction::Not: {
697 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
698 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
699 return Builder.CreateNot(A, Name);
700 }
// A constant index reads the lane directly from State; otherwise an
// extractelement with a runtime index is emitted.
701 case Instruction::ExtractElement: {
702 assert(State.VF.isVector() && "Only extract elements from vectors");
703 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
704 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
705 Value *Vec = State.get(getOperand(0));
706 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
707 return Builder.CreateExtractElement(Vec, Idx, Name);
708 }
709 case Instruction::InsertElement: {
710 assert(State.VF.isVector() && "Can only insert elements into vectors");
711 Value *Vec = State.get(getOperand(0), /*IsScalar=*/false);
712 Value *Elt = State.get(getOperand(1), /*IsScalar=*/true);
713 Value *Idx = State.get(getOperand(2), /*IsScalar=*/true);
714 return Builder.CreateInsertElement(Vec, Elt, Idx, Name);
715 }
// NOTE(review): listing line 717, which defines Op, is missing.
716 case Instruction::Freeze: {
718 return Builder.CreateFreeze(Op, Name);
719 }
720 case Instruction::FCmp:
721 case Instruction::ICmp: {
722 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
723 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
724 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
725 return Builder.CreateCmp(getPredicate(), A, B, Name);
726 }
727 case Instruction::PHI: {
728 llvm_unreachable("should be handled by VPPhi::execute");
729 }
// The condition is fetched as a scalar when only the first lane is used or
// when the condition operand is itself a single scalar.
730 case Instruction::Select: {
731 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
732 Value *Cond =
733 State.get(getOperand(0),
734 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
735 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
736 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
737 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlags(), Name);
738 }
// NOTE(review): case label (listing line 739) missing. For vector VFs this
// arm emits the get_active_lane_mask intrinsic; operand 2 is a constant that
// multiplies the VF to give the mask's element count.
740 // Get first lane of vector induction variable.
741 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
742 // Get the original loop tripcount.
743 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
744
745 // If this part of the active lane mask is scalar, generate the CMP directly
746 // to avoid unnecessary extracts.
747 if (State.VF.isScalar())
748 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
749 Name);
750
751 ElementCount EC = State.VF.multiplyCoefficientBy(
752 cast<VPConstantInt>(getOperand(2))->getZExtValue());
753 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
754 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
755 {PredTy, ScalarTC->getType()},
756 {VIVElem0, ScalarTC}, nullptr, Name);
757 }
// NOTE(review): case label (758) missing. Zero-extends an i1 vector to this
// recipe's inferred scalar type and sums the lanes with vector_reduce_add.
759 Value *Op = State.get(getOperand(0));
760 auto *VecTy = cast<VectorType>(Op->getType());
761 assert(VecTy->getScalarSizeInBits() == 1 &&
762 "NumActiveLanes only implemented for i1 vectors");
763
764 Type *Ty = State.TypeAnalysis.inferScalarType(this);
765 Value *ZExt = Builder.CreateCast(
766 Instruction::ZExt, Op, VectorType::get(Ty, VecTy->getElementCount()));
767 Value *NumActive =
768 Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
769 return NumActive;
770 }
// NOTE(review): case label (771) missing. If operand 0 is not a vector it is
// returned unchanged; otherwise the two operands are spliced.
772 // Generate code to combine the previous and current values in vector v3.
773 //
774 // vector.ph:
775 // v_init = vector(..., ..., ..., a[-1])
776 // br vector.body
777 //
778 // vector.body
779 // i = phi [0, vector.ph], [i+4, vector.body]
780 // v1 = phi [v_init, vector.ph], [v2, vector.body]
781 // v2 = a[i, i+1, i+2, i+3];
782 // v3 = vector(v1(3), v2(0, 1, 2))
783
784 auto *V1 = State.get(getOperand(0));
785 if (!V1->getType()->isVectorTy())
786 return V1;
787 Value *V2 = State.get(getOperand(1));
788 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
789 }
// NOTE(review): case label (790) and the definition of Zero (796) missing.
// Computes select(TC > VFxUF, TC - VFxUF, Zero).
791 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
792 Value *VFxUF = State.get(getOperand(1), VPLane(0));
793 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
794 Value *Cmp =
795 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
797 return Builder.CreateSelect(Cmp, Sub, Zero);
798 }
// NOTE(review): case label (799) missing. Emits the
// experimental_get_vector_length intrinsic on the AVL operand with the VF's
// known minimum value and `true` as further arguments; the result is i32.
800 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
801 // be outside of the main loop.
802 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
803 // Compute EVL
804 assert(AVL->getType()->isIntegerTy() &&
805 "Requested vector length should be an integer.");
806
807 assert(State.VF.isScalable() && "Expected scalable vector factor.");
808 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
809
810 Value *EVL = Builder.CreateIntrinsic(
811 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
812 {AVL, VFArg, Builder.getTrue()});
813 return EVL;
814 }
// NOTE(review): case label (815) and listing line 828 missing. Creates a
// conditional branch on lane 0 of operand 0.
816 Value *Cond = State.get(getOperand(0), VPLane(0));
817 // Replace the temporary unreachable terminator with a new conditional
818 // branch, hooking it up to backward destination for latch blocks now, and
819 // to forward destination(s) later when they are created.
820 // Second successor may be backwards - iff it is already in VPBB2IRBB.
821 VPBasicBlock *SecondVPSucc =
822 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
823 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
824 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
825 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
826 // First successor is always forward, reset it to nullptr.
827 Br->setSuccessor(0, nullptr);
829 applyMetadata(*Br);
830 return Br;
831 }
// NOTE(review): case label (832) missing. Splats scalar operand 0 across VF
// lanes.
833 return Builder.CreateVectorSplat(
834 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
835 }
// NOTE(review): case label (836) and the initializer of StructTy (840)
// missing. Each operand supplies one lane; each struct field of that lane is
// inserted into the matching vector field of the result.
837 // For struct types, we need to build a new 'wide' struct type, where each
838 // element is widened, i.e., we create a struct of vectors.
839 auto *StructTy =
841 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
842 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
843 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
844 FieldIndex++) {
845 Value *ScalarValue =
846 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
847 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
848 VectorValue =
849 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
850 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
851 }
852 }
853 return Res;
854 }
// NOTE(review): case label (855) missing. Builds a fixed-width vector with
// one element per operand, starting from poison.
856 auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
857 auto NumOfElements = ElementCount::getFixed(getNumOperands());
858 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
859 for (const auto &[Idx, Op] : enumerate(operands()))
860 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
861 Builder.getInt32(Idx));
862 return Res;
863 }
// NOTE(review): case label (864) and listing line 868 missing. For vector VFs
// this splats operand 1 and inserts operand 0 at lane 0; operand 2 is a
// constant that divides the VF.
865 if (State.VF.isScalar())
866 return State.get(getOperand(0), true);
867 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
869 // If this start vector is scaled then it should produce a vector with fewer
870 // elements than the VF.
871 ElementCount VF = State.VF.divideCoefficientBy(
872 cast<VPConstantInt>(getOperand(2))->getZExtValue());
873 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
874 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
875 Builder.getInt32(0));
876 }
// NOTE(review): case label (877) and listing lines 881, 891, 901 and 906-908
// missing, including the head of the assert and the condition that selects
// between the min/max and binary-op combine paths. Combines all operands into
// one value and, for vector VFs outside the loop, reduces that value with
// createSimpleReduction.
878 RecurKind RK = getRecurKind();
879 bool IsOrdered = isReductionOrdered();
880 bool IsInLoop = isReductionInLoop();
882 "FindIV should use min/max reduction kinds");
883
884 // The recipe may have multiple operands to be reduced together.
885 unsigned NumOperandsToReduce = getNumOperands();
886 VectorParts RdxParts(NumOperandsToReduce);
887 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
888 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
889
890 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
892
893 // Reduce multiple operands into one.
894 Value *ReducedPartRdx = RdxParts[0];
895 if (IsOrdered) {
896 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
897 } else {
898 // Floating-point operations should have some FMF to enable the reduction.
899 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
900 Value *RdxPart = RdxParts[Part];
902 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
903 else {
904 // For sub-recurrences, each part's reduction variable is already
905 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
909 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
910 ReducedPartRdx =
911 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
912 }
913 }
914 }
915
916 // Create the reduction after the loop. Note that inloop reductions create
917 // the target reduction in the loop using a Reduction recipe.
918 if (State.VF.isVector() && !IsInLoop) {
919 // TODO: Support in-order reductions based on the recurrence descriptor.
920 // All ops in the reduction inherit fast-math-flags from the recurrence
921 // descriptor.
922 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
923 }
924
925 return ReducedPartRdx;
926 }
// NOTE(review): case label(s) (927-928), the initializer of Offset (930) and
// listing line 942 missing. Extracts the lane that is Offset positions from
// the end of operand 0.
929 unsigned Offset =
931 Value *Res;
932 if (State.VF.isVector()) {
933 assert(Offset <= State.VF.getKnownMinValue() &&
934 "invalid offset to extract from");
935 // Extract lane VF - Offset from the operand.
936 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
937 } else {
938 // TODO: Remove ExtractLastLane for scalar VFs.
939 assert(Offset <= 1 && "invalid offset to extract from");
940 Res = State.get(getOperand(0));
941 }
943 Res->setName(Name);
944 return Res;
945 }
// NOTE(review): case label (946) missing. Logical AND of operands 0 and 1.
947 Value *A = State.get(getOperand(0));
948 Value *B = State.get(getOperand(1));
949 return Builder.CreateLogicalAnd(A, B, Name);
950 }
// NOTE(review): case label (951) missing. Logical OR of operands 0 and 1.
952 Value *A = State.get(getOperand(0));
953 Value *B = State.get(getOperand(1));
954 return Builder.CreateLogicalOr(A, B, Name);
955 }
// NOTE(review): case label (956) missing. Pointer add using lane 0 of both
// operands.
957 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
958 "can only generate first lane for PtrAdd");
959 Value *Ptr = State.get(getOperand(0), VPLane(0));
960 Value *Addend = State.get(getOperand(1), VPLane(0));
961 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
962 }
// NOTE(review): case label (963) and the initializer of Ptr (965) missing.
// Pointer add whose addend is fetched as a whole value rather than lane 0.
964 Value *Ptr =
966 Value *Addend = State.get(getOperand(1));
967 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
968 }
// NOTE(review): case label (969) missing. ORs together the frozen operands
// and, for vector VFs, OR-reduces the result to a single value.
970 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
971 for (VPValue *Op : drop_begin(operands()))
972 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
973 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
974 }
// NOTE(review): case label (975) missing; per the assert message this is the
// ExtractLane arm. Operand 0 is the lane index and operands 1.. are the
// sources. Source Idx covers lanes starting at RuntimeVF * (Idx - 1); a select
// chain keeps the extract from the last source whose start is <= the lane.
976 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
977 "simplified to ExtractElement.");
978 Value *LaneToExtract = State.get(getOperand(0), true);
979 Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
980 Value *Res = nullptr;
981 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
982
983 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
984 Value *VectorStart =
985 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
986 Value *VectorIdx = Idx == 1
987 ? LaneToExtract
988 : Builder.CreateSub(LaneToExtract, VectorStart);
989 Value *Ext = State.VF.isScalar()
990 ? State.get(getOperand(Idx))
991 : Builder.CreateExtractElement(
992 State.get(getOperand(Idx)), VectorIdx);
993 if (Res) {
994 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
995 Res = Builder.CreateSelect(Cmp, Ext, Res);
996 } else {
997 Res = Ext;
998 }
999 }
1000 return Res;
1001 }
// NOTE(review): case label (1002) and listing line 1022 (the call head of the
// vector branch of the ternary) missing. Counts trailing zero elements of the
// mask operand(s).
1003 Type *Ty = State.TypeAnalysis.inferScalarType(this);
1004 if (getNumOperands() == 1) {
1005 Value *Mask = State.get(getOperand(0));
1006 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
1007 /*ZeroIsPoison=*/false, Name);
1008 }
1009 // If there are multiple operands, create a chain of selects to pick the
1010 // first operand with an active lane and add the number of lanes of the
1011 // preceding operands.
1012 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
1013 unsigned LastOpIdx = getNumOperands() - 1;
1014 Value *Res = nullptr;
// Operands are visited from last to first, so the select emitted for an
// earlier operand wraps (and takes priority over) those of later operands.
1015 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
1016 Value *TrailingZeros =
1017 State.VF.isScalar()
1018 ? Builder.CreateZExt(
1019 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
1020 Builder.getFalse()),
1021 Ty)
1023 Ty, State.get(getOperand(Idx)),
1024 /*ZeroIsPoison=*/false, Name);
1025 Value *Current = Builder.CreateAdd(
1026 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
1027 TrailingZeros);
1028 if (Res) {
1029 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
1030 Res = Builder.CreateSelect(Cmp, Current, Res);
1031 } else {
1032 Res = Current;
1033 }
1034 }
1035
1036 return Res;
1037 }
// NOTE(review): case labels (1038, 1040, 1042) missing for the next three
// arms. The first returns operand 0 as a scalar, the second reverses operand
// 0, and the third walks (data, mask) operand pairs starting at index 1,
// using operand 0 as the initial result.
1039 return State.get(getOperand(0), true);
1041 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
1043 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
1044 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
1045 Value *Data = State.get(getOperand(Idx));
1046 Value *Mask = State.get(getOperand(Idx + 1));
1047 Type *VTy = Data->getType();
1048
1049 if (State.VF.isScalar())
1050 Result = Builder.CreateSelect(Mask, Data, Result);
1051 else
1052 Result = Builder.CreateIntrinsic(
1053 Intrinsic::experimental_vector_extract_last_active, {VTy},
1054 {Data, Mask, Result});
1055 }
1056
1057 return Result;
1058 }
1059 default:
1060 llvm_unreachable("Unsupported opcode for instruction");
1061 }
1062}
1063
1065 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
// NOTE(review): the first line of this signature and several interior lines
// (including the declarations of CtxI, Operands, CCH and SI used below) are
// absent from this listing; the comments only describe what is visible.
//
// Dispatches on \p Opcode and forwards to the matching TTI cost query. The
// result type used for the queries is the inferred scalar type of this recipe,
// widened to a vector type when \p VF is a vector.
1066 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1067 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1068 switch (Opcode) {
1069 case Instruction::FNeg:
1070 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1071 case Instruction::UDiv:
1072 case Instruction::SDiv:
1073 case Instruction::SRem:
1074 case Instruction::URem:
1075 case Instruction::Add:
1076 case Instruction::FAdd:
1077 case Instruction::Sub:
1078 case Instruction::FSub:
1079 case Instruction::Mul:
1080 case Instruction::FMul:
1081 case Instruction::FDiv:
1082 case Instruction::FRem:
1083 case Instruction::Shl:
1084 case Instruction::LShr:
1085 case Instruction::AShr:
1086 case Instruction::And:
1087 case Instruction::Or:
1088 case Instruction::Xor: {
1089 // Certain instructions can be cheaper if they have a constant second
1090 // operand. One example of this are shifts on x86.
1091 VPValue *RHS = getOperand(1);
1092 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
1093
1094 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1097
1100 if (CtxI)
1101 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1102 return Ctx.TTI.getArithmeticInstrCost(
1103 Opcode, ResultTy, Ctx.CostKind,
1104 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1105 RHSInfo, Operands, CtxI, &Ctx.TLI);
1106 }
1107 case Instruction::Freeze:
1108 // NOTE: The only way to ask for the cost is via getInstructionCost, which
1109 // requires the actual vector instruction. Instead, both here and in the
1110 // LoopVectorizationCostModel::getInstructionCost the costs mirror the
1111 // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep
1112 // them in sync.
1113 return TTI::TCC_Free;
1114 case Instruction::ExtractValue:
1115 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1116 Ctx.CostKind);
1117 case Instruction::ICmp:
1118 case Instruction::FCmp: {
// Compares are costed on the type of operand 0 (widened for vector VFs); the
// result type passed to TTI is the compare-result type derived from it.
1119 Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
1120 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
1122 return Ctx.TTI.getCmpSelInstrCost(
1123 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1124 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1125 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1126 }
1127 case Instruction::BitCast: {
// A bitcast whose scalar result type is a pointer costs 0; every other
// bitcast falls through and is costed like the casts below.
1128 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1129 if (ScalarTy->isPointerTy())
1130 return 0;
1131 [[fallthrough]];
1132 }
1133 case Instruction::SExt:
1134 case Instruction::ZExt:
1135 case Instruction::FPToUI:
1136 case Instruction::FPToSI:
1137 case Instruction::FPExt:
1138 case Instruction::PtrToInt:
1139 case Instruction::PtrToAddr:
1140 case Instruction::IntToPtr:
1141 case Instruction::SIToFP:
1142 case Instruction::UIToFP:
1143 case Instruction::Trunc:
1144 case Instruction::FPTrunc:
1145 case Instruction::AddrSpaceCast: {
1146 // Computes the CastContextHint from a recipe that may access memory.
1147 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1148 if (isa<VPInterleaveBase>(R))
1150 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1151 // Only compute CCH for memory operations, matching the legacy model
1152 // which only considers loads/stores for cast context hints.
1153 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1154 if (!isa<LoadInst, StoreInst>(UI))
1156 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1158 }
1159 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1160 if (WidenMemoryRecipe == nullptr)
1162 if (VF.isScalar())
1164 if (!WidenMemoryRecipe->isConsecutive())
1166 if (WidenMemoryRecipe->isMasked())
1169 };
1170
1171 VPValue *Operand = getOperand(0);
1173 bool IsReverse = false;
1174 // For Trunc/FPTrunc, get the context from the only user.
1175 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
// Returns the single unique user of R as a recipe, or nullptr when R has no
// users, more than one unique user, or the user is not a recipe.
1176 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1177 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1178 return nullptr;
1179 return dyn_cast<VPRecipeBase>(*R->user_begin());
1180 };
1181 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1182 if (match(Recipe,
1186 Recipe = GetOnlyUser(cast<VPSingleDefRecipe>(Recipe));
1187 IsReverse = true;
1188 }
1189 if (Recipe)
1190 CCH = ComputeCCH(Recipe);
1191 }
1192 }
1193 // For ZExt/SExt/FPExt, get the context from the recipe defining the operand.
1194 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1195 Opcode == Instruction::FPExt) {
1196 if (auto *Recipe = Operand->getDefiningRecipe()) {
1197 VPValue *ReverseOp;
1198 if (match(Recipe,
1199 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1201 m_VPValue(ReverseOp))))) {
1202 Recipe = ReverseOp->getDefiningRecipe();
1203 IsReverse = true;
1204 }
1205 if (Recipe)
1206 CCH = ComputeCCH(Recipe);
1207 }
1208 }
1209 if (IsReverse && CCH != TTI::CastContextHint::None)
1211
1212 auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
1213 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1214 // Arm TTI will use the underlying instruction to determine the cost.
1215 return Ctx.TTI.getCastInstrCost(
1216 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1218 }
1219 case Instruction::Select: {
// A condition defined outside the loop regions is treated as a scalar
// condition.
1221 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1222 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1223
1224 VPValue *Op0, *Op1;
1225 bool IsLogicalAnd =
1226 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1227 bool IsLogicalOr =
1228 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1229 // Also match the inverted forms:
1230 // select x, false, y --> !x & y (still AND)
1231 // select x, y, true --> !x | y (still OR)
1232 IsLogicalAnd |=
1233 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1234 IsLogicalOr |=
1235 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1236
// i1 selects that encode a logical and/or with a non-scalar condition are
// costed as the corresponding And/Or arithmetic instruction.
1237 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1238 (IsLogicalAnd || IsLogicalOr)) {
1239 // select x, y, false --> x & y
1240 // select x, true, y --> x | y
1241 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1242 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1243
1245 if (SI && all_of(operands(),
1246 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1247 append_range(Operands, SI->operands());
1248 return Ctx.TTI.getArithmeticInstrCost(
1249 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1250 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1251 }
1252
// Otherwise cost a real select. The condition type is only widened when the
// condition is not scalar and VF is a vector.
1253 Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1254 if (!IsScalarCond && VF.isVector())
1255 CondTy = VectorType::get(CondTy, VF);
1256
// Take the predicate from a compare feeding the condition: either a VPlan
// compare or, failing that, an IR CmpInst wrapped in a VPIRValue. Pred keeps
// its default-constructed value when neither applies.
1257 llvm::CmpPredicate Pred;
1258 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1259 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1260 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1261 Pred = Cmp->getPredicate();
1262 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1263 return Ctx.TTI.getCmpSelInstrCost(
1264 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1265 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1266 }
1267 }
1268 llvm_unreachable("called for unsupported opcode");
1269}
1270
1272 VPCostContext &Ctx) const {
// NOTE(review): the first signature line, most VPInstruction:: case labels
// and a few statements are absent from this listing; the comments only
// describe what is visible.
//
// Computes the cost of this VPInstruction for \p VF by switching on its
// opcode and querying TTI; opcodes without a dedicated case return 0.
1274 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1275 // TODO: Compute cost for VPInstructions without underlying values once
1276 // the legacy cost model has been retired.
1277 return 0;
1278 }
1279
1281 "Should only generate a vector value or single scalar, not scalars "
1282 "for all lanes.");
1284 getOpcode(),
1286 }
1287
1288 switch (getOpcode()) {
1289 case Instruction::Select: {
// The result of match() is not checked here; Pred is only filled in when the
// condition is a compare.
1291 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1292 auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
1293 auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
// Keep scalar types when only the first lane of the select is used.
1294 if (!vputils::onlyFirstLaneUsed(this)) {
1295 CondTy = toVectorTy(CondTy, VF);
1296 VecTy = toVectorTy(VecTy, VF);
1297 }
1298 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1299 Ctx.CostKind);
1300 }
1301 case Instruction::ExtractElement:
1303 if (VF.isScalar()) {
1304 // ExtractLane with VF=1 takes care of handling extracting across multiple
1305 // parts.
1306 return 0;
1307 }
1308
1309 // Add on the cost of extracting the element.
1310 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1311 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1312 Ctx.CostKind);
1313 }
1314 case VPInstruction::AnyOf: {
// AnyOf is costed as an Or reduction over the vector of this recipe's type.
1315 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1316 return Ctx.TTI.getArithmeticReductionCost(
1317 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1318 }
1320 Type *Ty = Ctx.Types.inferScalarType(this);
1321 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1322 if (VF.isScalar())
1323 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1325 CmpInst::ICMP_EQ, Ctx.CostKind);
1326 // Calculate the cost of determining the lane index.
1327 auto *PredTy = toVectorTy(ScalarTy, VF);
1328 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1329 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1330 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1331 }
1333 Type *Ty = Ctx.Types.inferScalarType(this);
1334 Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1335 if (VF.isScalar())
1336 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1338 CmpInst::ICMP_EQ, Ctx.CostKind);
1339 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1340 auto *PredTy = toVectorTy(ScalarTy, VF);
1341 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1342 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1343 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1344 // Add cost of NOT operation on the predicate.
1345 Cost += Ctx.TTI.getArithmeticInstrCost(
1346 Instruction::Xor, PredTy, Ctx.CostKind,
1347 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1348 {TargetTransformInfo::OK_UniformConstantValue,
1349 TargetTransformInfo::OP_None});
1350 // Add cost of SUB operation on the index.
1351 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1352 return Cost;
1353 }
1355 Type *ScalarTy = Ctx.Types.inferScalarType(this);
1356 Type *VecTy = toVectorTy(ScalarTy, VF);
1357 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1359 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1360 {VecTy, MaskTy, ScalarTy});
1361 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1362 }
1364 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1365 Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1366 return Ctx.TTI.getShuffleCost(
1368 cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1369 }
1371 Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
// Operand 2 is read as a constant multiplier that scales the number of mask
// lanes (VF * Multiplier).
1372 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1373 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1374 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1375 {ArgTy, ArgTy});
1376 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1377 }
1379 Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
1380 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1381 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1382 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1383 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1384 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1385 }
1387 assert(VF.isVector() && "Reverse operation must be vector type");
1388 Type *EltTy = Ctx.Types.inferScalarType(this);
1389 // Skip the reverse operation cost for the mask.
1390 // FIXME: Remove this once redundant mask reverse operations can be
1391 // eliminated by VPlanTransforms::cse before cost computation.
1392 if (EltTy->isIntegerTy(1))
1393 return 0;
1394 auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1395 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
1396 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1397 /*Index=*/0);
1398 }
1400 // Add on the cost of extracting the element.
1401 auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1402 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1403 VecTy, Ctx.CostKind, 0);
1404 }
1405 case Instruction::FCmp:
1406 case Instruction::ICmp: {
1407 // FIXME: We don't handle scalar compares inside the loop here yet, as loop
1408 // exit conditions are handled by the legacy cost model and avoiding all
1409 // scalar compares is the simplest way to avoid double-counting compares
1410 // that compute the loop exit condition.
1411 bool IsScalar = vputils::onlyFirstLaneUsed(this);
1412 const VPRegionBlock *Region = getRegion();
1413 if (IsScalar && Region &&
1414 Region == Region->getPlan()->getVectorLoopRegion())
1415 return 0;
// Compares of which only the first lane is used are costed with a fixed VF
// of 1.
1417 getOpcode(), IsScalar ? ElementCount::getFixed(1) : VF, Ctx);
1418 }
1420 if (VF == ElementCount::getScalable(1))
1422 [[fallthrough]];
1423 default:
1424 // TODO: Compute cost for other VPInstructions once the legacy cost model
1425 // has been retired.
1427 "unexpected VPInstruction witht underlying value");
1428 return 0;
1429 }
1430}
1431
1444
// NOTE(review): the signature line, some case labels and the statement of the
// default branch are missing from this listing. Visible behaviour: the
// function returns true for (at least) the Load and PHI opcodes.
1446 switch (getOpcode()) {
1447 case Instruction::Load:
1448 case Instruction::PHI:
1452 return true;
1453 default:
1455 }
1456}
1457
// NOTE(review): the signature line and parts of some statements are missing
// from this listing.
//
// Generates the IR value for this VPInstruction via generate() and, when the
// instruction has a result, records it in \p State. Masked VPInstructions
// must not reach this point (asserted below).
1459 assert(!isMasked() && "cannot execute masked VPInstruction");
// Restores the builder's fast-math flags when this function returns.
1460 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1462 "Set flags not supported for the provided opcode");
1464 "Opcode requires specific flags to be set");
1465 if (hasFastMathFlags())
1466 State.Builder.setFastMathFlags(getFastMathFlags());
1467 Value *GeneratedValue = generate(State);
// Nothing to record for instructions that do not produce a result.
1468 if (!hasResult())
1469 return;
1470 assert(GeneratedValue && "generate must produce a value");
1471 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
// Sanity check: unless VF is scalar, a vector or struct value must have been
// produced exactly when the value is not a first-lane-only scalar.
1474 assert((((GeneratedValue->getType()->isVectorTy() ||
1475 GeneratedValue->getType()->isStructTy()) ==
1476 !GeneratesPerFirstLaneOnly) ||
1477 State.VF.isScalar()) &&
1478 "scalar value but not only first lane defined");
1479 State.set(this, GeneratedValue,
1480 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1482 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1483 // resume phis, when vectorizing the epilogue. Must be removed once epilogue
1484 // vectorization explicitly connects VPlans.
1485 setUnderlyingValue(GeneratedValue);
1486 }
1487}
1488
1492 return false;
// NOTE(review): the signature, the guard belonging to the early return above,
// a run of VPInstruction:: case labels and the body of the Call case are
// missing from this listing.
//
// Opcode classification: the opcodes listed before `return false` yield
// false; any opcode without an explicit case takes the default branch and
// yields true.
1493 switch (getOpcode()) {
1494 case Instruction::ExtractValue:
1495 case Instruction::InsertValue:
1496 case Instruction::GetElementPtr:
1497 case Instruction::ExtractElement:
1498 case Instruction::InsertElement:
1499 case Instruction::Freeze:
1500 case Instruction::FCmp:
1501 case Instruction::ICmp:
1502 case Instruction::Select:
1503 case Instruction::PHI:
1528 case VPInstruction::Not:
1537 return false;
1538 case Instruction::Call:
1541 default:
1542 return true;
1543 }
1544}
1545
// NOTE(review): the signature line, the guard of the first return and several
// VPInstruction:: case labels are missing from this listing.
//
// Answers, per opcode, whether only the first lane of operand \p Op is used
// by this recipe. \p Op must be one of the recipe's operands (asserted).
1547 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1549 return vputils::onlyFirstLaneUsed(this);
1550
1551 switch (getOpcode()) {
1552 default:
1553 return false;
1554 case Instruction::ExtractElement:
// Only operand 1 of an ExtractElement is reported as first-lane-only.
1555 return Op == getOperand(1);
1556 case Instruction::InsertElement:
// For InsertElement, operands 1 and 2 are reported as first-lane-only.
1557 return Op == getOperand(1) || Op == getOperand(2);
1558 case Instruction::PHI:
1559 return true;
1560 case Instruction::FCmp:
1561 case Instruction::ICmp:
1562 case Instruction::Select:
1563 case Instruction::Or:
1564 case Instruction::Freeze:
1565 case VPInstruction::Not:
1566 // TODO: Cover additional opcodes.
// For these opcodes the answer is the same as for the recipe's own result.
1567 return vputils::onlyFirstLaneUsed(this);
1568 case Instruction::Load:
1578 return true;
1581 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1582 // operand, after replicating its operands only the first lane is used.
1583 // Before replicating, it will have only a single operand.
1584 return getNumOperands() > 1;
1586 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1588 // WidePtrAdd supports scalar and vector base addresses.
1589 return false;
1592 return Op == getOperand(0);
1593 };
1594 llvm_unreachable("switch should return");
1595}
1596
// NOTE(review): the signature line, the guard of the first return and the
// case labels preceding `return true` are missing from this listing.
//
// Answers, per opcode, whether only the first part of operand \p Op is used
// by this recipe. \p Op must be one of the recipe's operands (asserted).
1598 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1600 return vputils::onlyFirstPartUsed(this);
1601
1602 switch (getOpcode()) {
1603 default:
1604 return false;
1605 case Instruction::FCmp:
1606 case Instruction::ICmp:
1607 case Instruction::Select:
// For compares and selects the answer is the same as for the recipe's own
// result.
1608 return vputils::onlyFirstPartUsed(this);
1613 return true;
1614 };
1615 llvm_unreachable("switch should return");
1616}
1617
1618#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): the signature line and the statement that uses SlotTracker
// are missing from this listing.
// Builds a slot tracker for the plan that contains this recipe's parent block.
1620 VPSlotTracker SlotTracker(getParent()->getPlan());
1622}
1623
1625 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line, the case labels of the switch below
// and a few statements are missing from this listing, so each mnemonic
// appears here without the opcode it belongs to.
//
// Prints this recipe to \p O: the indent, "EMIT" (or "EMIT-SCALAR" when the
// instruction is a single scalar), " = " after the result if there is one,
// an opcode mnemonic, and finally the IR flags.
1626 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1627
1628 if (hasResult()) {
1630 O << " = ";
1631 }
1632
1633 switch (getOpcode()) {
1634 case VPInstruction::Not:
1635 O << "not";
1636 break;
1638 O << "active lane mask";
1639 break;
1641 O << "incoming-alias-mask";
1642 break;
1644 O << "EXPLICIT-VECTOR-LENGTH";
1645 break;
1647 O << "first-order splice";
1648 break;
1650 O << "branch-on-cond";
1651 break;
1653 O << "branch-on-two-conds";
1654 break;
1656 O << "TC > VF ? TC - VF : 0";
1657 break;
1659 O << "VF * Part +";
1660 break;
1662 O << "branch-on-count";
1663 break;
1665 O << "broadcast";
1666 break;
1668 O << "buildstructvector";
1669 break;
1671 O << "buildvector";
1672 break;
1674 O << "exiting-iv-value";
1675 break;
1677 O << "masked-cond";
1678 break;
1680 O << "extract-lane";
1681 break;
1683 O << "extract-last-lane";
1684 break;
1686 O << "extract-last-part";
1687 break;
1689 O << "extract-penultimate-element";
1690 break;
1692 O << "compute-reduction-result";
1693 break;
1695 O << "logical-and";
1696 break;
1698 O << "logical-or";
1699 break;
1701 O << "ptradd";
1702 break;
1704 O << "wide-ptradd";
1705 break;
1707 O << "any-of";
1708 break;
1710 O << "first-active-lane";
1711 break;
1713 O << "last-active-lane";
1714 break;
1716 O << "reduction-start-vector";
1717 break;
1719 O << "resume-for-epilogue";
1720 break;
1722 O << "reverse";
1723 break;
1725 O << "unpack";
1726 break;
1728 O << "extract-last-active";
1729 break;
1731 O << "num-active-lanes";
1732 break;
1733 default:
1735 }
1736
1737 printFlags(O);
1739}
1740#endif
1741
// NOTE(review): the signature line, the guard that opens the cast path and
// one case label are missing from this listing.
//
// Generates IR for a VPInstruction that carries an explicit result type.
// Visible paths: a scalar cast of lane 0 of operand 0, a step vector, and a
// vscale value.
1743 Type *ResultTy = getResultType();
1745 Value *Op = State.get(getOperand(0), VPLane(0));
1746 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1747 Op, ResultTy);
// The builder may return something other than an Instruction; flags and
// metadata are only applied when an instruction was actually created.
1748 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
1749 applyFlags(*CastOp);
1750 applyMetadata(*CastOp);
1751 }
1752 State.set(this, Cast, VPLane(0));
1753 return;
1754 }
1755 switch (getOpcode()) {
1757 Value *StepVector =
1758 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1759 State.set(this, StepVector);
1760 break;
1761 }
1762 case VPInstruction::VScale: {
// vscale is recorded as a scalar value (third argument to State.set is true).
1763 Value *VScale = State.Builder.CreateVScale(ResultTy);
1764 State.set(this, VScale, true);
1765 break;
1766 }
1767
1768 default:
1769 llvm_unreachable("opcode not implemented yet");
1770 }
1771}
1772
1773#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1775 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line, several case labels and the
// operand-printing statements are missing from this listing.
//
// Prints "EMIT"/"EMIT-SCALAR", " = ", and then an opcode-specific
// description; casts (the default branch) end with " to <result type>".
1776 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1778 O << " = ";
1779
1780 Type *ResultTy = getResultType();
1781 switch (getOpcode()) {
1783 O << "wide-iv-step ";
1785 break;
1787 O << "step-vector " << *ResultTy;
1788 break;
1790 O << "vscale " << *ResultTy;
1791 break;
1792 case Instruction::Load:
1793 O << "load ";
1795 break;
1796 default:
// Every opcode without a dedicated case is expected to be a cast.
1797 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1799 printFlags(O);
1801 O << " to " << *ResultTy;
1802 }
1803}
1804#endif
1805
// NOTE(review): the signature line is missing from this listing.
//
// Creates a scalar IR phi for this recipe, adds the incoming values that are
// already available, and registers the phi as lane 0 of this recipe in
// \p State.
// The phi is created with 2 reserved incoming slots.
1807 PHINode *NewPhi = State.Builder.CreatePHI(getScalarType(), 2, getName());
1808 unsigned NumIncoming = getNumIncoming();
1809 // Detect header phis: the parent block dominates its second incoming block
1810 // (the latch). Those IR incoming values have not been generated yet and need
1811 // to be added after they have been executed.
1812 if (NumIncoming == 2 &&
1813 State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
1814 NumIncoming = 1;
1815 }
// Add lane 0 of each available incoming value, keyed by the IR block that was
// generated for the corresponding incoming VPlan block.
1816 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1817 Value *IncV = State.get(getIncomingValue(Idx), VPLane(0));
1818 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
1819 NewPhi->addIncoming(IncV, PredBB);
1820 }
1821 State.set(this, NewPhi, VPLane(0));
1822}
1823
1824#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Prints this phi recipe to \p O: the indent, "EMIT" (or "EMIT-SCALAR" for a
// single scalar), " = phi" and the IR flags.
// NOTE(review): the statements that print the result and the incoming
// operands are missing from this listing.
1825void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1826 VPSlotTracker &SlotTracker) const {
1827 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1829 O << " = phi";
1830 printFlags(O);
1832}
1833#endif
1834
1835VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1836 if (auto *Phi = dyn_cast<PHINode>(&I))
1837 return new VPIRPhi(*Phi);
1838 return new VPIRInstruction(I);
1839}
1840
// NOTE(review): the signature line is missing from this listing.
//
// The wrapped IR instruction already exists, so nothing is generated; the
// only effect is to move the builder's insert point. This path must not be
// used for VPIRPhi and expects the recipe to have no operands (asserted).
1842 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1843 "PHINodes must be handled by VPIRPhi");
1844 // Advance the insert point after the wrapped IR instruction. This allows
1845 // interleaving VPIRInstructions and other recipes.
1846 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1847}
1848
1850 VPCostContext &Ctx) const {
// Always returns a cost of 0; neither the VF nor \p Ctx is consulted.
1851 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1852 // hence it does not contribute to the cost-modeling for the VPlan.
1853 return 0;
1854}
1855
1856#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1858 VPSlotTracker &SlotTracker) const {
// Prints the indent, the tag "IR " and the wrapped IR instruction itself.
1859 O << Indent << "IR " << I;
1860}
1861#endif
1862
// NOTE(review): the signature line and one arm of the lane selection below
// are missing from this listing.
//
// Updates the wrapped IR phi with one incoming value per operand. Operand Idx
// belongs to predecessor Idx of the parent block; the value is taken from a
// lane chosen by whether the operand is a single scalar (otherwise the last
// lane for the current VF).
1864 PHINode *Phi = &getIRPhi();
1865 for (const auto &[Idx, Op] : enumerate(operands())) {
1866 VPValue *ExitValue = Op;
1867 auto Lane = vputils::isSingleScalar(ExitValue)
1869 : VPLane::getLastLaneForVF(State.VF);
1870 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1871 auto *PredVPBB = Pred->getExitingBasicBlock();
1872 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1873 // Set insertion point in PredBB in case an extract needs to be generated.
1874 // TODO: Model extracts explicitly.
1875 State.Builder.SetInsertPoint(PredBB->getTerminator());
1876 Value *V = State.get(ExitValue, VPLane(Lane));
1877 // If there is no existing block for PredBB in the phi, add a new incoming
1878 // value. Otherwise update the existing incoming value for PredBB.
1879 if (Phi->getBasicBlockIndex(PredBB) == -1)
1880 Phi->addIncoming(V, PredBB);
1881 else
1882 Phi->setIncomingValueForBlock(PredBB, V);
1883 }
1884
1885 // Advance the insert point after the wrapped IR instruction. This allows
1886 // interleaving VPIRInstructions and other recipes.
1887 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
1888}
1889
// NOTE(review): the signature line is missing from this listing.
//
// Removes the phi operand that corresponds to predecessor IncomingBlock.
// Operand positions mirror the parent block's predecessor order (asserted
// below), so the predecessor's index selects the operand to drop.
// The const_cast is needed because getAsRecipe() is used through a const
// path here while the recipe is mutated.
1891 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1892 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1893 "Number of phi operands must match number of predecessors");
1894 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
1895 R->removeOperand(Position);
1896}
1897
// NOTE(review): the line carrying the function name and parameters is missing
// from this listing.
// Returns the incoming value whose position equals the index of VPBB among
// the predecessors of the recipe's parent block.
1898VPValue *
1900 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1901 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
1902}
1903
1905 VPValue *V) const {
// Replaces the operand at the position of VPBB among the parent block's
// predecessors with \p V. The method is const, hence the const_cast to
// mutate the underlying recipe.
1906 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1907 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
1908}
1909
1910#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1912 VPSlotTracker &SlotTracker) const {
// Prints the phi operands as a comma-separated list of
// "[ <value>, <incoming block> ]" pairs; the incoming block of an operand is
// looked up by the operand's index.
1913 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1914 [this, &O, &SlotTracker](auto Op) {
1915 O << "[ ";
1916 Op.value()->printAsOperand(O, SlotTracker);
1917 O << ", ";
1918 getIncomingBlock(Op.index())->printAsOperand(O);
1919 O << " ]";
1920 });
1921}
1922#endif
1923
1924#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1926 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line, the statement printing the wrapped
// instruction and the head of the interleave call are missing from this
// listing.
1928
// When the phi recipe has operands, list them as
// " (extra operand(s): <value> from <block>, ...)"; the plural "s" is only
// printed for more than one operand.
1929 if (getNumOperands() != 0) {
1930 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1932 [&O, &SlotTracker](auto Op) {
1933 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1934 O << " from ";
1935 std::get<1>(Op)->printAsOperand(O);
1936 });
1937 O << ")";
1938 }
1939}
1940#endif
1941
// NOTE(review): the signature line is missing from this listing.
// Attaches every stored (kind, node) metadata pair to the instruction I.
1943 for (const auto &[Kind, Node] : Metadata)
1944 I.setMetadata(Kind, Node);
1945}
1946
// NOTE(review): the signature line is missing from this listing.
//
// Replaces this object's metadata with the entries that also occur in
// Other.Metadata, i.e. pairs with the same kind and the same MDNode pointer.
// The order of the surviving entries follows this object's original order.
1948 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1949 for (const auto &[KindA, MDA] : Metadata) {
1950 for (const auto &[KindB, MDB] : Other.Metadata) {
1951 if (KindA == KindB && MDA == MDB) {
1952 MetadataIntersection.emplace_back(KindA, MDA);
// One match is enough; stop scanning Other so the entry is added only once.
1953 break;
1954 }
1955 }
1956 }
1957 Metadata = std::move(MetadataIntersection);
1958}
1959
1960#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): the signature line is missing from this listing.
//
// Prints the stored metadata as " (!<kind name> <node>, ...)". Nothing is
// printed when there is no metadata or when the slot tracker has no module.
1962 const Module *M = SlotTracker.getModule();
1963 if (Metadata.empty() || !M)
1964 return;
1965
// MDNames is indexed by metadata kind ID to obtain the printable kind name.
1966 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1967 O << " (";
1968 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1969 auto [Kind, Node] = KindNodePair;
1970 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1971 "Unexpected unnamed metadata kind");
1972 O << "!" << MDNames[Kind] << " ";
1973 Node->printAsOperand(O, M);
1974 });
1975 O << ")";
1976}
1977#endif
1978
// NOTE(review): the signature line and the declarations of Args, OpBundles
// and CI are missing from this listing.
//
// Emits a call to the vector function Variant with the recipe's arguments,
// copies flags, metadata and Variant's calling convention onto the call, and
// records the result in \p State unless the call returns void.
1980 assert(State.VF.isVector() && "not widening");
1981 assert(Variant != nullptr && "Can't create vector function.");
1982
1983 FunctionType *VFTy = Variant->getFunctionType();
1984 // Collect the call arguments, matching each to the variant's parameter type.
1986 for (const auto &I : enumerate(args())) {
1987 Value *Arg;
1988 // Some vectorized function variants may also take a scalar argument,
1989 // e.g. linear parameters for pointers. This needs to be the scalar value
1990 // from the start of the respective part when interleaving.
1991 if (!VFTy->getParamType(I.index())->isVectorTy())
1992 Arg = State.get(I.value(), VPLane(0));
1993 else
1994 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
1995 Args.push_back(Arg);
1996 }
1997
// Carry over operand bundles when CI is non-null.
2000 if (CI)
2001 CI->getOperandBundlesAsDefs(OpBundles);
2002
2003 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
2004 applyFlags(*V);
2005 applyMetadata(*V);
2006 V->setCallingConv(Variant->getCallingConv());
2007
2008 if (!V->getType()->isVoidTy())
2009 State.set(this, V);
2010}
2011
2013 VPCostContext &Ctx) const {
// Returns the cost of calling Variant. VF is only used to check that the
// variant's return type was vectorized for the requested VF.
2014 assert(getVectorizedTypeVF(Variant->getReturnType()) == VF &&
2015 "Variant return type must match VF");
2016 return computeCallCost(Variant, Ctx);
2017}
2018
2020 VPCostContext &Ctx) {
// Queries TTI for the cost of a call with Variant's return and parameter
// types; no function is passed to the query (first argument is nullptr).
2021 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
2022 Variant->getFunctionType()->params(),
2023 Ctx.CostKind);
2024}
2025
// NOTE(review): the signature line is missing from this listing.
//
// Returns true if every argument position at which \p Op is passed has an
// integer, floating-point, pointer or byte parameter type in Variant's
// function type. Argument positions holding other values are ignored.
2027 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2028 assert(Variant && "Variant not set");
2029 FunctionType *VFTy = Variant->getFunctionType();
2030 return all_of(enumerate(args()), [VFTy, &Op](const auto &Arg) {
2031 auto [Idx, V] = Arg;
2032 Type *ArgTy = VFTy->getParamType(Idx);
2033 return V != Op || ArgTy->isIntegerTy() || ArgTy->isFloatingPointTy() ||
2034 ArgTy->isPointerTy() || ArgTy->isByteTy();
2035 });
2036}
2037
2038#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2040 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statement printing the
// result operand are missing from this listing.
//
// Prints "WIDEN-CALL", the result (or "void "), "call" with flags, the name
// and arguments of the scalar callee, and finally the vector variant in use.
2041 O << Indent << "WIDEN-CALL ";
2042
2043 Function *CalledFn = getCalledScalarFunction();
2044 if (CalledFn->getReturnType()->isVoidTy())
2045 O << "void ";
2046 else {
2048 O << " = ";
2049 }
2050
2051 O << "call";
2052 printFlags(O);
2053 O << " @" << CalledFn->getName() << "(";
2054 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
2055 Op->printAsOperand(O, SlotTracker);
2056 });
2057 O << ")";
2058
// The variant's name is only appended when it has one.
2059 O << " (using library function";
2060 if (Variant->hasName())
2061 O << ": " << Variant->getName();
2062 O << ")";
2063}
2064#endif
2065
// NOTE(review): the signature line, the declarations of Args, OpBundles and
// CI, and part of one condition are missing from this listing.
//
// Builds and returns a call to the vector form of VectorIntrinsicID. The
// overload types for the declaration (TysForDecl) are gathered from the
// return type and from the argument types, in that order.
2067 assert(State.VF.isVector() && "not widening");
2068
2069 SmallVector<Type *, 2> TysForDecl;
2070 // Add return type if intrinsic is overloaded on it.
2071 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
2072 State.TTI)) {
2073 Type *RetTy = toVectorizedTy(getScalarType(), State.VF);
2074 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
2075 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
2077 Idx, State.TTI))
2078 TysForDecl.push_back(Ty);
2079 }
2080 }
2082 for (const auto &I : enumerate(operands())) {
2083 // Some intrinsics have a scalar argument - don't replace it with a
2084 // vector.
2085 Value *Arg;
2086 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
2087 State.TTI))
2088 Arg = State.get(I.value(), VPLane(0));
2089 else
2090 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
// Arguments the intrinsic is overloaded on contribute their type to the
// declaration's overload list.
2091 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
2092 State.TTI))
2093 TysForDecl.push_back(Arg->getType());
2094 Args.push_back(Arg);
2095 }
2096
2097 // Use vector version of the intrinsic.
2098 Module *M = State.Builder.GetInsertBlock()->getModule();
2099 Function *VectorF =
2100 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
2101 assert(VectorF &&
2102 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
2103
// Carry over operand bundles when CI is non-null.
2106 if (CI)
2107 CI->getOperandBundlesAsDefs(OpBundles);
2108
2109 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
2110
2111 applyFlags(*V);
2112 applyMetadata(*V);
2113
2114 return V;
2115}
2116
// NOTE(review): the signature line is missing from this listing.
// Creates the vector intrinsic call and records it in \p State, unless the
// call returns void.
2118 CallInst *V = createVectorCall(State);
2119 if (!V->getType()->isVoidTy())
2120 State.set(this, V);
2121}
2122
2125 const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx) {
// NOTE(review): the first signature line(s) (which introduce ID and
// Operands), the declaration of Arguments and the tail of the CostAttrs
// constructor call are missing from this listing.
//
// Computes the TTI cost of calling intrinsic ID for recipe R at VF, with the
// return and parameter types widened for VF.
2126 Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
2127 // Skip the reverse operation cost for the mask.
2128 // FIXME: Remove this once redundant mask reverse operations can be eliminated
2129 // by VPlanTransforms::cse before cost computation.
2130 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
2131 return InstructionCost(0);
2132
2133 // Some backends analyze intrinsic arguments to determine cost. Use the
2134 // underlying value for the operand if it has one. Otherwise try to use the
2135 // operand of the underlying call instruction, if there is one. Otherwise
2136 // clear Arguments.
2137 // TODO: Rework TTI interface to be independent of concrete IR values.
2139 for (const auto &[Idx, Op] : enumerate(Operands)) {
2140 auto *V = Op->getUnderlyingValue();
2141 if (!V) {
2142 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
2143 Arguments.push_back(UI->getArgOperand(Idx));
2144 continue;
2145 }
// No IR value is available for this operand: pass no arguments at all rather
// than a partial list.
2146 Arguments.clear();
2147 break;
2148 }
2149 Arguments.push_back(V);
2150 }
2151
2152 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
2153 SmallVector<Type *> ParamTys =
2154 map_to_vector(Operands, [&](const VPValue *Op) {
2155 return toVectorTy(Ctx.Types.inferScalarType(Op), VF);
2156 });
2157
2158 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
2159 IntrinsicCostAttributes CostAttrs(
2160 ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
2161 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
2163 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
2164}
2165
2167 VPCostContext &Ctx) const {
// NOTE(review): the first signature line and the definition of ArgOps are
// missing from this listing.
// Delegates to computeCallCost with this recipe's intrinsic ID and ArgOps.
2169 return computeCallCost(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
2170}
2171
// NOTE(review): the signature line is missing from this listing.
// Returns the base name of the intrinsic identified by VectorIntrinsicID.
2173 return Intrinsic::getBaseName(VectorIntrinsicID);
2174}
2175
// NOTE(review): the signature line and the first line of the lambda's return
// expression are missing from this listing.
// Evaluates a per-operand predicate over all (index, operand) pairs and
// returns true only if it holds for every one; \p Op must be an operand of
// the recipe (asserted).
2177 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2178 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
2179 auto [Idx, V] = X;
2181 Idx, nullptr);
2182 });
2183}
2184
2185#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2187 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line, the statement printing the result
// operand and the head of the interleave call are missing from this listing.
//
// Prints "WIDEN-INTRINSIC", the result (or "void " when the scalar type is
// void), "call" with flags, the intrinsic name and the operand list.
2188 O << Indent << "WIDEN-INTRINSIC ";
2189 if (getScalarType()->isVoidTy()) {
2190 O << "void ";
2191 } else {
2193 O << " = ";
2194 }
2195
2196 O << "call";
2197 printFlags(O);
2198 O << getIntrinsicName() << "(";
2199
2201 Op->printAsOperand(O, SlotTracker);
2202 });
2203 O << ")";
2204}
2205#endif
2206
// NOTE(review): the signature line is missing from this listing.
// Creates the vector intrinsic call, attaches the recipe's alignment as a
// parameter attribute on argument 0, and records the call in \p State.
2208 CallInst *MemI = createVectorCall(State);
2209 MemI->addParamAttr(
2210 0, Attribute::getWithAlignment(MemI->getContext(), Alignment));
2211 State.set(this, MemI);
2212}
2213
2215 Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment,
2216 VPCostContext &Ctx) {
// NOTE(review): the first signature line (return type and function name) is
// missing from this listing.
// Queries TTI for the cost of memory intrinsic IID on type Ty with the given
// masking and alignment; no pointer value is supplied to the query.
2217 return Ctx.TTI.getMemIntrinsicInstrCost(
2218 MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr, IsMasked, Alignment),
2219 Ctx.CostKind);
2220}
2221
2224 VPCostContext &Ctx) const {
// NOTE(review): the first signature line(s) and the head of the return
// statement are missing from this listing.
// The access is costed on the recipe's scalar type widened to VF, and is
// treated as masked unless operand 2 matches the constant true.
2225 Type *Ty = toVectorTy(getScalarType(), VF);
2227 !match(getOperand(2), m_True()), Alignment,
2228 Ctx);
2229}
2230
2232 IRBuilderBase &Builder = State.Builder;
2233
2234 Value *Address = State.get(getOperand(0));
2235 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2236 VectorType *VTy = cast<VectorType>(Address->getType());
2237
2238 // The histogram intrinsic requires a mask even if the recipe doesn't;
2239 // if the mask operand was omitted then all lanes should be executed and
2240 // we just need to synthesize an all-true mask.
2241 Value *Mask = nullptr;
2242 if (VPValue *VPMask = getMask())
2243 Mask = State.get(VPMask);
2244 else
2245 Mask =
2246 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2247
2248 // If this is a subtract, we want to invert the increment amount. We may
2249 // add a separate intrinsic in future, but for now we'll try this.
2250 if (Opcode == Instruction::Sub)
2251 IncAmt = Builder.CreateNeg(IncAmt);
2252 else
2253 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2254
2255 auto *HistogramInst = State.Builder.CreateIntrinsic(
2256 Intrinsic::experimental_vector_histogram_add, {VTy, IncAmt->getType()},
2257 {Address, IncAmt, Mask});
2258 applyMetadata(*HistogramInst);
2259}
2260
2262 VPCostContext &Ctx) const {
2263 // FIXME: Take the gather and scatter into account as well. For now we're
2264 // generating the same cost as the fallback path, but we'll likely
2265 // need to create a new TTI method for determining the cost, including
2266 // whether we can use base + vec-of-smaller-indices or just
2267 // vec-of-pointers.
2268 assert(VF.isVector() && "Invalid VF for histogram cost");
2269 Type *AddressTy = Ctx.Types.inferScalarType(getOperand(0));
2270 VPValue *IncAmt = getOperand(1);
2271 Type *IncTy = Ctx.Types.inferScalarType(IncAmt);
2272 VectorType *VTy = VectorType::get(IncTy, VF);
2273
2274 // Assume that a non-constant update value (or a constant != 1) requires
2275 // a multiply, and add that into the cost.
2276 InstructionCost MulCost =
2277 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2278 if (match(IncAmt, m_One()))
2279 MulCost = TTI::TCC_Free;
2280
2281 // Find the cost of the histogram operation itself.
2282 Type *PtrTy = VectorType::get(AddressTy, VF);
2283 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2284 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2285 Type::getVoidTy(Ctx.LLVMCtx),
2286 {PtrTy, IncTy, MaskTy});
2287
2288 // Add the costs together with the add/sub operation.
2289 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2290 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2291}
2292
2293#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2295 VPSlotTracker &SlotTracker) const {
2296 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2298
2299 if (Opcode == Instruction::Sub)
2300 O << ", dec: ";
2301 else {
2302 assert(Opcode == Instruction::Add);
2303 O << ", inc: ";
2304 }
2306
2307 if (VPValue *Mask = getMask()) {
2308 O << ", mask: ";
2309 Mask->printAsOperand(O, SlotTracker);
2310 }
2311}
2312#endif
2313
2314VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2315 AllowReassoc = FMF.allowReassoc();
2316 NoNaNs = FMF.noNaNs();
2317 NoInfs = FMF.noInfs();
2318 NoSignedZeros = FMF.noSignedZeros();
2319 AllowReciprocal = FMF.allowReciprocal();
2320 AllowContract = FMF.allowContract();
2321 ApproxFunc = FMF.approxFunc();
2322}
2323
2325 switch (Opcode) {
2326 case Instruction::Add:
2327 case Instruction::Sub:
2328 case Instruction::Mul:
2329 case Instruction::Shl:
2331 return WrapFlagsTy(false, false);
2332 case Instruction::Trunc:
2333 return TruncFlagsTy(false, false);
2334 case Instruction::Or:
2335 return DisjointFlagsTy(false);
2336 case Instruction::AShr:
2337 case Instruction::LShr:
2338 case Instruction::UDiv:
2339 case Instruction::SDiv:
2340 return ExactFlagsTy(false);
2341 case Instruction::GetElementPtr:
2344 return GEPNoWrapFlags::none();
2345 case Instruction::ZExt:
2346 case Instruction::UIToFP:
2347 return NonNegFlagsTy(false);
2348 case Instruction::FAdd:
2349 case Instruction::FSub:
2350 case Instruction::FMul:
2351 case Instruction::FDiv:
2352 case Instruction::FRem:
2353 case Instruction::FNeg:
2354 case Instruction::FPExt:
2355 case Instruction::FPTrunc:
2356 return FastMathFlags();
2357 case Instruction::ICmp:
2358 case Instruction::FCmp:
2360 llvm_unreachable("opcode requires explicit flags");
2361 default:
2362 return VPIRFlags();
2363 }
2364}
2365
2366#if !defined(NDEBUG)
/// Debug-only check (compiled under !NDEBUG) that the kind of flags stored in
/// this object, as recorded in OpType, may be attached to \p Opcode.
///
/// Each OperationType is matched against the list of opcodes accepted for it;
/// OperationType::Other accepts every opcode.
///
/// \param Opcode an Instruction:: or VPInstruction:: opcode value.
/// \returns true if \p Opcode is one of the opcodes accepted for OpType.
2367bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2368 switch (OpType) {
      // nuw/nsw style wrap flags.
2369 case OperationType::OverflowingBinOp:
2370 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2371 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
      // NOTE(review): the doubled `VPInstruction::VPInstruction::` qualifier
      // below looks redundant next to the single-qualified VPInstruction
      // opcodes used in the other cases - confirm and simplify.
2372 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2373 case OperationType::Trunc:
2374 return Opcode == Instruction::Trunc;
2375 case OperationType::DisjointOp:
2376 return Opcode == Instruction::Or;
      // Exact flag: right shifts and integer divisions.
2377 case OperationType::PossiblyExactOp:
2378 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2379 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
      // GEP no-wrap flags: IR GEPs plus the VPlan pointer-add opcodes.
2380 case OperationType::GEPOp:
2381 return Opcode == Instruction::GetElementPtr ||
2382 Opcode == VPInstruction::PtrAdd ||
2383 Opcode == VPInstruction::WidePtrAdd;
      // Fast-math flags.
2384 case OperationType::FPMathOp:
2385 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2386 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2387 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2388 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2389 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2390 Opcode == Instruction::Select ||
2391 Opcode == VPInstruction::WideIVStep ||
      // NOTE(review): the final operand of this condition (listing line 2392)
      // is not visible in this view.
2393 case OperationType::FCmp:
2394 return Opcode == Instruction::FCmp;
2395 case OperationType::NonNegOp:
2396 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
      // The Cmp kind is accepted for both compare opcodes.
2397 case OperationType::Cmp:
2398 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2399 case OperationType::ReductionOp:
      // NOTE(review): the statement for ReductionOp (listing line 2400) is not
      // visible in this view.
2401 case OperationType::Other:
2402 return true;
2403 }
      // Every enumerator handled above returns, so reaching this point means
      // OpType holds a value the switch does not know about.
2404 llvm_unreachable("Unknown OperationType enum");
2405}
2406
/// Debug-only check (compiled under !NDEBUG) that this object carries the kind
/// of flags that \p Opcode requires.
///
/// \param Opcode an Instruction:: or VPInstruction:: opcode value.
/// \returns true if OpType equals the kind required for \p Opcode, or if the
/// default flags for \p Opcode are of kind OperationType::Other (no specific
/// kind is required).
2407bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2408 // Handle opcodes without default flags.
      // These are answered directly, before getDefaultFlags() is consulted.
2409 if (Opcode == Instruction::ICmp)
2410 return OpType == OperationType::Cmp;
2411 if (Opcode == Instruction::FCmp)
2412 return OpType == OperationType::FCmp;
      // NOTE(review): the condition guarding the next return (listing line
      // 2413) is not visible in this view.
2414 return OpType == OperationType::ReductionOp;
2415
      // For all remaining opcodes the required kind is whatever kind the
      // default flags for the opcode have.
2416 OperationType Required = getDefaultFlags(Opcode).OpType;
2417 return Required == OperationType::Other || Required == OpType;
2418}
2419#endif
2420
2421#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Write a short lower-case textual name for the recurrence kind \p Kind to
/// \p OS (for example "add", "smax" or "find-iv"). Nothing else is printed: no
/// leading or trailing whitespace and no newline.
///
/// NOTE(review): several case labels are absent from this view (the listing
/// numbers skip before "add-chain-with-subs", "fadd-chain-with-subs",
/// "fminimum", "fmaximum", "fminimumnum", "fmaximumnum" and "find-last"), so
/// the enumerators that select those strings cannot be confirmed here.
///
/// \param OS   stream that receives the name.
/// \param Kind recurrence kind to describe.
2422static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind) {
      // The switch has no default label.
2423 switch (Kind) {
2424 case RecurKind::None:
2425 OS << "none";
2426 break;
2427 case RecurKind::Add:
2428 OS << "add";
2429 break;
2430 case RecurKind::Sub:
2431 OS << "sub";
2432 break;
2434 OS << "add-chain-with-subs";
2435 break;
2436 case RecurKind::Mul:
2437 OS << "mul";
2438 break;
2439 case RecurKind::Or:
2440 OS << "or";
2441 break;
2442 case RecurKind::And:
2443 OS << "and";
2444 break;
2445 case RecurKind::Xor:
2446 OS << "xor";
2447 break;
2448 case RecurKind::SMin:
2449 OS << "smin";
2450 break;
2451 case RecurKind::SMax:
2452 OS << "smax";
2453 break;
2454 case RecurKind::UMin:
2455 OS << "umin";
2456 break;
2457 case RecurKind::UMax:
2458 OS << "umax";
2459 break;
2460 case RecurKind::FAdd:
2461 OS << "fadd";
2462 break;
2464 OS << "fadd-chain-with-subs";
2465 break;
2466 case RecurKind::FSub:
2467 OS << "fsub";
2468 break;
2469 case RecurKind::FMul:
2470 OS << "fmul";
2471 break;
2472 case RecurKind::FMin:
2473 OS << "fmin";
2474 break;
2475 case RecurKind::FMax:
2476 OS << "fmax";
2477 break;
2478 case RecurKind::FMinNum:
2479 OS << "fminnum";
2480 break;
2481 case RecurKind::FMaxNum:
2482 OS << "fmaxnum";
2483 break;
2485 OS << "fminimum";
2486 break;
2488 OS << "fmaximum";
2489 break;
2491 OS << "fminimumnum";
2492 break;
2494 OS << "fmaximumnum";
2495 break;
2496 case RecurKind::FMulAdd:
2497 OS << "fmuladd";
2498 break;
2499 case RecurKind::AnyOf:
2500 OS << "any-of";
2501 break;
2502 case RecurKind::FindIV:
2503 OS << "find-iv";
2504 break;
2506 OS << "find-last";
2507 break;
2508 }
2509}
2510
2512 switch (OpType) {
2513 case OperationType::Cmp:
2515 break;
2516 case OperationType::FCmp:
2519 break;
2520 case OperationType::DisjointOp:
2521 if (DisjointFlags.IsDisjoint)
2522 O << " disjoint";
2523 break;
2524 case OperationType::PossiblyExactOp:
2525 if (ExactFlags.IsExact)
2526 O << " exact";
2527 break;
2528 case OperationType::OverflowingBinOp:
2529 if (WrapFlags.HasNUW)
2530 O << " nuw";
2531 if (WrapFlags.HasNSW)
2532 O << " nsw";
2533 break;
2534 case OperationType::Trunc:
2535 if (TruncFlags.HasNUW)
2536 O << " nuw";
2537 if (TruncFlags.HasNSW)
2538 O << " nsw";
2539 break;
2540 case OperationType::FPMathOp:
2542 break;
2543 case OperationType::GEPOp: {
2545 if (Flags.isInBounds())
2546 O << " inbounds";
2547 else if (Flags.hasNoUnsignedSignedWrap())
2548 O << " nusw";
2549 if (Flags.hasNoUnsignedWrap())
2550 O << " nuw";
2551 break;
2552 }
2553 case OperationType::NonNegOp:
2554 if (NonNegFlags.NonNeg)
2555 O << " nneg";
2556 break;
2557 case OperationType::ReductionOp: {
2558 O << " (";
2560 if (isReductionInLoop())
2561 O << ", in-loop";
2562 if (isReductionOrdered())
2563 O << ", ordered";
2564 O << ")";
2566 break;
2567 }
2568 case OperationType::Other:
2569 break;
2570 }
2571 O << " ";
2572}
2573#endif
2574
2576 auto &Builder = State.Builder;
2577 switch (Opcode) {
2578 case Instruction::Call:
2579 case Instruction::UncondBr:
2580 case Instruction::CondBr:
2581 case Instruction::PHI:
2582 case Instruction::GetElementPtr:
2583 llvm_unreachable("This instruction is handled by a different recipe.");
2584 case Instruction::UDiv:
2585 case Instruction::SDiv:
2586 case Instruction::SRem:
2587 case Instruction::URem:
2588 case Instruction::Add:
2589 case Instruction::FAdd:
2590 case Instruction::Sub:
2591 case Instruction::FSub:
2592 case Instruction::FNeg:
2593 case Instruction::Mul:
2594 case Instruction::FMul:
2595 case Instruction::FDiv:
2596 case Instruction::FRem:
2597 case Instruction::Shl:
2598 case Instruction::LShr:
2599 case Instruction::AShr:
2600 case Instruction::And:
2601 case Instruction::Or:
2602 case Instruction::Xor: {
2603 // Just widen unops and binops.
2605 for (VPValue *VPOp : operands())
2606 Ops.push_back(State.get(VPOp));
2607
2608 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2609
2610 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2611 applyFlags(*VecOp);
2612 applyMetadata(*VecOp);
2613 }
2614
2615 // Use this vector value for all users of the original instruction.
2616 State.set(this, V);
2617 break;
2618 }
2619 case Instruction::ExtractValue: {
2620 assert(getNumOperands() == 2 && "expected single level extractvalue");
2621 Value *Op = State.get(getOperand(0));
2622 Value *Extract = Builder.CreateExtractValue(
2623 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2624 State.set(this, Extract);
2625 break;
2626 }
2627 case Instruction::Freeze: {
2628 Value *Op = State.get(getOperand(0));
2629 Value *Freeze = Builder.CreateFreeze(Op);
2630 State.set(this, Freeze);
2631 break;
2632 }
2633 case Instruction::ICmp:
2634 case Instruction::FCmp: {
2635 // Widen compares. Generate vector compares.
2636 bool FCmp = Opcode == Instruction::FCmp;
2637 Value *A = State.get(getOperand(0));
2638 Value *B = State.get(getOperand(1));
2639 Value *C = nullptr;
2640 if (FCmp) {
2641 C = Builder.CreateFCmp(getPredicate(), A, B);
2642 } else {
2643 C = Builder.CreateICmp(getPredicate(), A, B);
2644 }
2645 if (auto *I = dyn_cast<Instruction>(C)) {
2646 applyFlags(*I);
2647 applyMetadata(*I);
2648 }
2649 State.set(this, C);
2650 break;
2651 }
2652 case Instruction::Select: {
2653 VPValue *CondOp = getOperand(0);
2654 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2655 Value *Op0 = State.get(getOperand(1));
2656 Value *Op1 = State.get(getOperand(2));
2657 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2658 State.set(this, Sel);
2659 if (auto *I = dyn_cast<Instruction>(Sel)) {
2661 applyFlags(*I);
2662 applyMetadata(*I);
2663 }
2664 break;
2665 }
2666 default:
2667 // This instruction is not vectorized by simple widening.
2668 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2669 << Instruction::getOpcodeName(Opcode));
2670 llvm_unreachable("Unhandled instruction!");
2671 } // end of switch.
2672
2673#if !defined(NDEBUG)
2674 // Verify that VPlan type inference results agree with the type of the
2675 // generated values.
2676 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2677 State.get(this)->getType() &&
2678 "inferred type and type from generated instructions do not match");
2679#endif
2680}
2681
2683 VPCostContext &Ctx) const {
2684 switch (Opcode) {
2685 case Instruction::UDiv:
2686 case Instruction::SDiv:
2687 case Instruction::SRem:
2688 case Instruction::URem:
2689 // If the div/rem operation isn't safe to speculate and requires
2690 // predication, then the only way we can even create a vplan is to insert
2691 // a select on the second input operand to ensure we use the value of 1
2692 // for the inactive lanes. The select will be costed separately.
2693 case Instruction::FNeg:
2694 case Instruction::Add:
2695 case Instruction::FAdd:
2696 case Instruction::Sub:
2697 case Instruction::FSub:
2698 case Instruction::Mul:
2699 case Instruction::FMul:
2700 case Instruction::FDiv:
2701 case Instruction::FRem:
2702 case Instruction::Shl:
2703 case Instruction::LShr:
2704 case Instruction::AShr:
2705 case Instruction::And:
2706 case Instruction::Or:
2707 case Instruction::Xor:
2708 case Instruction::Freeze:
2709 case Instruction::ExtractValue:
2710 case Instruction::ICmp:
2711 case Instruction::FCmp:
2712 case Instruction::Select:
2713 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2714 default:
2715 llvm_unreachable("Unsupported opcode for instruction");
2716 }
2717}
2718
2719#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2721 VPSlotTracker &SlotTracker) const {
2722 O << Indent << "WIDEN ";
2724 O << " = " << Instruction::getOpcodeName(Opcode);
2725 printFlags(O);
2727}
2728#endif
2729
2731 auto &Builder = State.Builder;
2732 /// Vectorize casts.
2733 assert(State.VF.isVector() && "Not vectorizing?");
2734 Type *DestTy = VectorType::get(getScalarType(), State.VF);
2735 VPValue *Op = getOperand(0);
2736 Value *A = State.get(Op);
2737 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2738 State.set(this, Cast);
2739 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2740 applyFlags(*CastOp);
2741 applyMetadata(*CastOp);
2742 }
2743}
2744
2749
2750#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2752 VPSlotTracker &SlotTracker) const {
2753 O << Indent << "WIDEN-CAST ";
2755 O << " = " << Instruction::getOpcodeName(Opcode);
2756 printFlags(O);
2758 O << " to " << *getScalarType();
2759}
2760#endif
2761
2763 VPCostContext &Ctx) const {
2764 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2765}
2766
2767#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2769 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2770 O << Indent;
2772 O << " = WIDEN-INDUCTION";
2773 printFlags(O);
2775
2776 if (auto *TI = getTruncInst())
2777 O << " (truncated to " << *TI->getType() << ")";
2778}
2779#endif
2780
2782 // The step may be defined by a recipe in the preheader (e.g. if it requires
2783 // SCEV expansion), but for the canonical induction the step is required to be
2784 // 1, which is represented as live-in.
2785 return match(getStartValue(), m_ZeroInt()) &&
2786 match(getStepValue(), m_One()) &&
2787 getScalarType() == getRegion()->getCanonicalIVType();
2788}
2789
2790#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2792 VPSlotTracker &SlotTracker) const {
2793 O << Indent;
2795 O << " = DERIVED-IV ";
2796 getStartValue()->printAsOperand(O, SlotTracker);
2797 O << " + ";
2798 getOperand(1)->printAsOperand(O, SlotTracker);
2799 O << " * ";
2800 getStepValue()->printAsOperand(O, SlotTracker);
2801}
2802#endif
2803
2805 // Fast-math-flags propagate from the original induction instruction.
2806 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2807 State.Builder.setFastMathFlags(getFastMathFlags());
2808
2809 // Compute scalar induction steps. BaseIV is the scalar induction variable
2810 // on which to base the steps, Step is the size of the step.
2811
2812 Value *BaseIV = State.get(getOperand(0), VPLane(0));
2813 Value *Step = State.get(getStepValue(), VPLane(0));
2814 IRBuilderBase &Builder = State.Builder;
2815
2816 // Ensure step has the same type as that of scalar IV.
2817 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2818 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2819
2820 // We build scalar steps for both integer and floating-point induction
2821 // variables. Here, we determine the kind of arithmetic we will perform.
2824 if (BaseIVTy->isIntegerTy()) {
2825 AddOp = Instruction::Add;
2826 MulOp = Instruction::Mul;
2827 } else {
2828 AddOp = InductionOpcode;
2829 MulOp = Instruction::FMul;
2830 }
2831
2832 // Determine the number of scalars we need to generate.
2833 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
2834 // Compute the scalar steps and save the results in State.
2835
2836 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2837 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
2838 : Constant::getNullValue(BaseIVTy);
2839
2840 for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
2841 // It is okay if the induction variable type cannot hold the lane number,
2842 // we expect truncation in this case.
2843 Constant *LaneValue =
2844 BaseIVTy->isIntegerTy()
2845 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
2846 /*ImplicitTrunc=*/true)
2847 : ConstantFP::get(BaseIVTy, Lane);
2848 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
2849 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2850 "Expected StartIdx to be folded to a constant when VF is not "
2851 "scalable");
2852 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2853 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
2854 State.set(this, Add, VPLane(Lane));
2855 }
2856}
2857
2858#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2860 VPSlotTracker &SlotTracker) const {
2861 O << Indent;
2863 O << " = SCALAR-STEPS ";
2865}
2866#endif
2867
2869 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2871}
2872
2874 assert(State.VF.isVector() && "not widening");
2875 // Construct a vector GEP by widening the operands of the scalar GEP as
2876 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2877 // results in a vector of pointers when at least one operand of the GEP
2878 // is vector-typed. Thus, to keep the representation compact, we only use
2879 // vector-typed operands for loop-varying values.
2880
2881 bool AllOperandsAreInvariant = all_of(operands(), [](VPValue *Op) {
2882 return Op->isDefinedOutsideLoopRegions();
2883 });
2884 if (AllOperandsAreInvariant) {
2885 // If we are vectorizing, but the GEP has only loop-invariant operands,
2886 // the GEP we build (by only using vector-typed operands for
2887 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2888 // produce a vector of pointers, we need to either arbitrarily pick an
2889 // operand to broadcast, or broadcast a clone of the original GEP.
2890 // Here, we broadcast a clone of the original.
2891
2893 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2894 Ops.push_back(State.get(getOperand(I), VPLane(0)));
2895
2896 auto *NewGEP =
2897 State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
2898 "", getGEPNoWrapFlags());
2899 Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
2900 State.set(this, Splat);
2901 return;
2902 }
2903
2904 // If the GEP has at least one loop-varying operand, we are sure to
2905 // produce a vector of pointers unless VF is scalar.
2906 // The pointer operand of the new GEP. If it's loop-invariant, we
2907 // won't broadcast it.
2908 auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
2909
2910 // Collect all the indices for the new GEP. If any index is
2911 // loop-invariant, we won't broadcast it.
2913 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2914 VPValue *Operand = getOperand(I);
2915 Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
2916 }
2917
2918 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2919 // but it should be a vector, otherwise.
2920 auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
2921 "", getGEPNoWrapFlags());
2922 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2923 "NewGEP is not a pointer vector");
2924 State.set(this, NewGEP);
2925}
2926
2927#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2929 VPSlotTracker &SlotTracker) const {
2930 O << Indent << "WIDEN-GEP ";
2931 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2932 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2933 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2934
2935 O << " ";
2937 O << " = getelementptr";
2938 printFlags(O);
2940}
2941#endif
2942
2944 assert(!getOffset() && "Unexpected offset operand");
2945 VPBuilder Builder(this);
2946 VPlan &Plan = *getParent()->getPlan();
2947 VPValue *VFVal = getVFValue();
2948 VPTypeAnalysis TypeInfo(Plan);
2949 const DataLayout &DL = Plan.getDataLayout();
2950 Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(this));
2951 VPValue *Stride =
2952 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
2953 Type *VFTy = TypeInfo.inferScalarType(VFVal);
2954 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
2956
2957 // Offset for Part0 = Offset0 = Stride * (VF - 1).
2958 VPInstruction *VFMinusOne =
2959 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
2960 DebugLoc::getUnknown(), "", {true, true});
2961 VPInstruction *Offset0 =
2962 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
2963
2964 // Offset for PartN = Offset0 + Part * Stride * VF.
2965 VPValue *PartxStride =
2966 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
2967 VPValue *Offset = Builder.createAdd(
2968 Offset0,
2969 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
2971}
2972
2974 auto &Builder = State.Builder;
2975 assert(getOffset() && "Expected prior materialization of offset");
2976 Value *Ptr = State.get(getPointer(), true);
2977 Value *Offset = State.get(getOffset(), true);
2978 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
2980 State.set(this, ResultPtr, /*IsScalar*/ true);
2981}
2982
2983#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2985 VPSlotTracker &SlotTracker) const {
2986 O << Indent;
2988 O << " = vector-end-pointer";
2989 printFlags(O);
2991}
2992#endif
2993
2995 assert(getVFxPart() &&
2996 "Expected prior simplification of recipe without VFxPart");
2997
2998 auto &Builder = State.Builder;
2999 Value *Ptr = State.get(getOperand(0), VPLane(0));
3000 Value *Offset = State.get(getVFxPart(), true);
3001 // TODO: Expand to VPInstruction to support constant folding.
3002 if (!match(getStride(), m_One())) {
3003 Value *Stride = Builder.CreateZExtOrTrunc(State.get(getStride(), true),
3004 Offset->getType());
3005 Offset = Builder.CreateMul(Offset, Stride);
3006 }
3007 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3009 State.set(this, ResultPtr, /*IsScalar*/ true);
3010}
3011
3012#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3014 VPSlotTracker &SlotTracker) const {
3015 O << Indent;
3017 O << " = vector-pointer";
3018 printFlags(O);
3020}
3021#endif
3022
3024 VPCostContext &Ctx) const {
3025 // A blend will be expanded to a select VPInstruction, which will generate a
3026 // scalar select if only the first lane is used.
3028 VF = ElementCount::getFixed(1);
3029
3030 Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
3031 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
3032 return (getNumIncomingValues() - 1) *
3033 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
3034 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
3035}
3036
3037#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3039 VPSlotTracker &SlotTracker) const {
3040 O << Indent << "BLEND ";
3042 O << " =";
3043 printFlags(O);
3044 if (getNumIncomingValues() == 1) {
3045 // Not a User of any mask: not really blending, this is a
3046 // single-predecessor phi.
3047 getIncomingValue(0)->printAsOperand(O, SlotTracker);
3048 } else {
3049 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
3050 if (I != 0)
3051 O << " ";
3052 getIncomingValue(I)->printAsOperand(O, SlotTracker);
3053 if (I == 0 && isNormalized())
3054 continue;
3055 O << "/";
3056 getMask(I)->printAsOperand(O, SlotTracker);
3057 }
3058 }
3059}
3060#endif
3061
3065 "In-loop AnyOf reductions aren't currently supported");
3066 // Propagate the fast-math flags carried by the underlying instruction.
3067 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
3068 State.Builder.setFastMathFlags(getFastMathFlags());
3069 Value *NewVecOp = State.get(getVecOp());
3070 if (VPValue *Cond = getCondOp()) {
3071 Value *NewCond = State.get(Cond, State.VF.isScalar());
3072 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
3073 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
3074
3075 Value *Start = getRecurrenceIdentity(Kind, ElementTy, getFastMathFlags());
3076 if (State.VF.isVector())
3077 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
3078
3079 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
3080 NewVecOp = Select;
3081 }
3082 Value *NewRed;
3083 Value *NextInChain;
3084 if (isOrdered()) {
3085 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3086 if (State.VF.isVector())
3087 NewRed =
3088 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
3089 else
3090 NewRed = State.Builder.CreateBinOp(
3092 PrevInChain, NewVecOp);
3093 PrevInChain = NewRed;
3094 NextInChain = NewRed;
3095 } else if (isPartialReduction()) {
3096 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
3097 "Unexpected partial reduction kind");
3098 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
3099 NewRed = State.Builder.CreateIntrinsic(
3100 PrevInChain->getType(),
3101 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
3102 : Intrinsic::vector_partial_reduce_fadd,
3103 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
3104 "partial.reduce");
3105 PrevInChain = NewRed;
3106 NextInChain = NewRed;
3107 } else {
3108 assert(isInLoop() &&
3109 "The reduction must either be ordered, partial or in-loop");
3110 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3111 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
3113 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
3114 else
3115 NextInChain = State.Builder.CreateBinOp(
3117 PrevInChain, NewRed);
3118 }
3119 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
3120}
3121
3123
3124 auto &Builder = State.Builder;
3125 // Propagate the fast-math flags carried by the underlying instruction.
3126 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
3127 Builder.setFastMathFlags(getFastMathFlags());
3128
3130 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
3131 Value *VecOp = State.get(getVecOp());
3132 Value *EVL = State.get(getEVL(), VPLane(0));
3133
3134 Value *Mask;
3135 if (VPValue *CondOp = getCondOp())
3136 Mask = State.get(CondOp);
3137 else
3138 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3139
3140 Value *NewRed;
3141 if (isOrdered()) {
3142 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
3143 } else {
3144 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
3146 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
3147 else
3148 NewRed = Builder.CreateBinOp(
3150 Prev);
3151 }
3152 State.set(this, NewRed, /*IsScalar*/ true);
3153}
3154
3156 VPCostContext &Ctx) const {
3157 RecurKind RdxKind = getRecurrenceKind();
3158 Type *ElementTy = Ctx.Types.inferScalarType(this);
3159 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
3160 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
3162 std::optional<FastMathFlags> OptionalFMF =
3163 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
3164
3165 if (isPartialReduction()) {
3166 InstructionCost CondCost = 0;
3167 if (isConditional()) {
3169 auto *CondTy = cast<VectorType>(
3170 toVectorTy(Ctx.Types.inferScalarType(getCondOp()), VF));
3171 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
3172 CondTy, Pred, Ctx.CostKind);
3173 }
3174 return CondCost + Ctx.TTI.getPartialReductionCost(
3175 Opcode, ElementTy, ElementTy, ElementTy, VF,
3176 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
3177 OptionalFMF);
3178 }
3179
3180 // TODO: Support any-of reductions.
3181 assert(
3183 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
3184 "Any-of reduction not implemented in VPlan-based cost model currently.");
3185
3186 // Note that TTI should model the cost of moving result to the scalar register
3187 // and the BinOp cost in the getMinMaxReductionCost().
3190 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
3191 }
3192
3193 // Note that TTI should model the cost of moving result to the scalar register
3194 // and the BinOp cost in the getArithmeticReductionCost().
3195 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
3196 Ctx.CostKind);
3197}
3198
/// Bundle the recipes in \p ExpressionRecipes into a single expression recipe
/// of kind \p ExpressionType.
///
/// The scalar type passed to the VPSingleDefRecipe base is the scalar type of
/// the chain operand of the last recipe, which is cast to VPReductionRecipe.
/// The constructor then:
///  1. clones every recipe except the last one that has users outside the
///     bundle, and redirects those outside users to the clone;
///  2. removes each bundled recipe that has a parent from that parent;
///  3. adds every operand not defined by a bundled recipe as an operand of
///     this recipe and records a matching entry in LiveInPlaceholders;
///  4. rewrites the bundled recipes to use the placeholders instead of the
///     original external operands.
///
/// \param ExpressionType    kind of expression being formed.
/// \param ExpressionRecipes recipes to bundle; must be non-empty, must not
///                          contain recipes with side effects, and the last
///                          entry must be a VPReductionRecipe.
3199VPExpressionRecipe::VPExpressionRecipe(
3200 ExpressionTypes ExpressionType,
3201 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
3202 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {},
3203 cast<VPReductionRecipe>(ExpressionRecipes.back())
3204 ->getChainOp()
3205 ->getScalarType()),
3206 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
      // NOTE(review): the base-class initializer above already calls
      // ExpressionRecipes.back(), so this check runs after that access rather
      // than guarding it.
3207 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
3208 assert(
3209 none_of(ExpressionRecipes,
3210 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3211 "expression cannot contain recipes with side-effects");
3212
3213 // Maintain a copy of the expression recipes as a set of users.
3214 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
3215 for (auto *R : ExpressionRecipes)
3216 ExpressionRecipesAsSetOfUsers.insert(R);
3217
3218 // Recipes in the expression, except the last one, must only be used by
3219 // (other) recipes inside the expression. If there are other users, external
3220 // to the expression, use a clone of the recipe for external users.
3221 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
3222 if (R != ExpressionRecipes.back() &&
3223 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
3224 return !ExpressionRecipesAsSetOfUsers.contains(U);
3225 })) {
3226 // There are users outside of the expression. Clone the recipe and use the
3227 // clone for those external users.
3228 VPSingleDefRecipe *CopyForExtUsers = R->clone();
3229 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
3230 VPUser &U, unsigned) {
3231 return !ExpressionRecipesAsSetOfUsers.contains(&U);
3232 });
        // The clone is inserted immediately before R, while R is still in its
        // block (R is only unlinked below).
3233 CopyForExtUsers->insertBefore(R);
3234 }
      // Unlink R from its block, if it is still in one.
3235 if (R->getParent())
3236 R->removeFromParent();
3237 }
3238
3239 // Internalize all external operands to the expression recipes. To do so,
3240 // create new temporary VPValues for all operands defined by a recipe outside
3241 // the expression. The original operands are added as operands of the
3242 // VPExpressionRecipe itself.
3243 for (auto *R : ExpressionRecipes) {
3244 for (const auto &[Idx, Op] : enumerate(R->operands())) {
3245 auto *Def = Op->getDefiningRecipe();
        // Operands defined by a bundled recipe are internal; skip them.
3246 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
3247 continue;
3248 addOperand(Op);
3249 LiveInPlaceholders.push_back(
        // NOTE(review): the argument to push_back (listing line 3250) is not
        // visible in this view.
3251 }
3252 }
3253
3254 // Replace each external operand with the first one created for it in
3255 // LiveInPlaceholders.
3256 for (auto *R : ExpressionRecipes)
3257 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3258 R->replaceUsesOfWith(LiveIn, Tmp);
3259}
3260
// Undo the bundling: re-insert the bundled recipes before this recipe, route
// the placeholder uses back to this recipe's operands, forward this recipe's
// users to the last bundled recipe and drop the bundled list.
// NOTE(review): the signature line (embedded line 3261) is missing from this
// listing; presumably this is VPExpressionRecipe::decompose() - confirm.
3262 for (auto *R : ExpressionRecipes)
3263 // Since the list could contain duplicates, make sure the recipe hasn't
3264 // already been inserted.
3265 if (!R->getParent())
3266 R->insertBefore(this);
3267
// Placeholder Idx is paired with operand Idx of this recipe.
3268 for (const auto &[Idx, Op] : enumerate(operands()))
3269 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3270
3271 replaceAllUsesWith(ExpressionRecipes.back());
3272 ExpressionRecipes.clear();
3273}
3274
// Return the cost of the bundled expression for the vectorization factor VF
// by dispatching on ExpressionType to the matching TTI cost query.
// NOTE(review): the first signature line (embedded line 3275) is missing from
// this listing; the body reads a VF and \p Ctx and the calls elsewhere suggest
// this is VPExpressionRecipe::computeCost - confirm.
3276 VPCostContext &Ctx) const {
// RedTy is the scalar type inferred for this recipe; SrcVecTy is the vector
// type built from the scalar type of operand 0.
3277 Type *RedTy = Ctx.Types.inferScalarType(this);
3278 auto *SrcVecTy = cast<VectorType>(
3279 toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
3280 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3281 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3282 switch (ExpressionType) {
3283 case ExpressionTypes::ExtendedReduction: {
// NOTE(review): this local Opcode shadows the function-level Opcode above;
// it is derived from ExpressionRecipes[1] rather than from back().
3284 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3285 cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
3286 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3287 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3288
// NOTE(review): embedded line 3292 (one argument line of the call below) is
// missing from this listing.
3289 if (RedR->isPartialReduction())
3290 return Ctx.TTI.getPartialReductionCost(
3291 Opcode, Ctx.Types.inferScalarType(getOperand(0)), nullptr, RedTy, VF,
3293 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3294 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3295 : std::nullopt);
3296 else if (!RedTy->isFloatingPointTy())
3297 // TTI::getExtendedReductionCost only supports integer types.
3298 return Ctx.TTI.getExtendedReductionCost(
3299 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3300 std::nullopt, Ctx.CostKind);
// NOTE(review): the statement of this else branch (embedded line 3302) is
// missing from this listing.
3301 else
3303 }
3304 case ExpressionTypes::MulAccReduction:
3305 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3306 Ctx.CostKind);
3307
3308 case ExpressionTypes::ExtNegatedMulAccReduction:
// Map the add-style reduction opcode to its subtracting counterpart, then
// share the costing code of ExtMulAccReduction.
3309 switch (Opcode) {
3310 case Instruction::Add:
3311 Opcode = Instruction::Sub;
3312 break;
3313 case Instruction::FAdd:
3314 Opcode = Instruction::FSub;
3315 break;
3316 default:
3317 llvm_unreachable("Unsupported opcode for ExtNegatedMulAccReduction");
3318 }
3319 [[fallthrough]];
3320 case ExpressionTypes::ExtMulAccReduction: {
3321 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3322 if (RedR->isPartialReduction()) {
// Bundled layout used here: [0] and [1] are the two extends, [2] is the mul.
3323 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3324 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3325 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
// NOTE(review): embedded lines 3329 and 3331 (the start of the two
// extend-kind arguments) are missing from this listing.
3326 return Ctx.TTI.getPartialReductionCost(
3327 Opcode, Ctx.Types.inferScalarType(getOperand(0)),
3328 Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
3330 Ext0R->getOpcode()),
3332 Ext1R->getOpcode()),
3333 Mul->getOpcode(), Ctx.CostKind,
3334 RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3335 : std::nullopt);
3336 }
3337 assert(Opcode != Instruction::FSub && "Only integer types are supported");
3338 return Ctx.TTI.getMulAccReductionCost(
3339 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3340 Instruction::ZExt,
3341 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3342 }
3343 }
3344 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3345}
3346
// Return true if at least one bundled recipe may read from or write to memory.
// NOTE(review): the signature line (embedded line 3347) is missing from this
// listing; presumably VPExpressionRecipe::mayReadOrWriteMemory() - confirm.
3348 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3349 return R->mayReadFromMemory() || R->mayWriteToMemory();
3350 });
3351}
3352
// Always returns false; in asserts-enabled builds it additionally checks that
// no bundled recipe reports side effects.
// NOTE(review): the signature line (embedded line 3353) is missing from this
// listing; presumably VPExpressionRecipe::mayHaveSideEffects() - confirm.
3354 assert(
3355 none_of(ExpressionRecipes,
3356 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3357 "expression cannot contain recipes with side-effects");
3358 return false;
3359}
3360
// Return true if the last bundled recipe is a VPReductionRecipe that is not a
// partial reduction.
// NOTE(review): the signature line (embedded line 3361) is missing from this
// listing, so the function name is not visible here - confirm against the
// header.
3362 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3363 return RR && !RR->isPartialReduction();
3364}
3365
3366#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3367
// Debug printer for an expression recipe. Emits "EXPRESSION", then a textual
// form of the bundled pattern that depends on ExpressionType.
// NOTE(review): the first signature line (embedded line 3368) and most of the
// lines that print individual operands are missing from this listing (see the
// gaps in the embedded numbering), so only the literal text pieces are visible.
3369 VPSlotTracker &SlotTracker) const {
3370 O << Indent << "EXPRESSION ";
3372 O << " = ";
3373 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3374 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3375
3376 switch (ExpressionType) {
3377 case ExpressionTypes::ExtendedReduction: {
3379 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3380 O << Instruction::getOpcodeName(Opcode) << " (";
3382 Red->printFlags(O);
3383
// ExpressionRecipes[0] is the extend feeding the reduction.
3384 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3385 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3386 << *Ext0->getScalarType();
3387 if (Red->isConditional()) {
3388 O << ", ";
3389 Red->getCondOp()->printAsOperand(O, SlotTracker);
3390 }
3391 O << ")";
3392 break;
3393 }
3394 case ExpressionTypes::ExtNegatedMulAccReduction: {
3396 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3398 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3399 << " (sub (0, mul";
// Bundled layout used here: [0] and [1] are the extends, [2] is the mul.
3400 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3401 Mul->printFlags(O);
3402 O << "(";
3404 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3405 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3406 << *Ext0->getScalarType() << "), (";
3408 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3409 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3410 << *Ext1->getScalarType() << ")";
3411 if (Red->isConditional()) {
3412 O << ", ";
3413 Red->getCondOp()->printAsOperand(O, SlotTracker);
3414 }
3415 O << "))";
3416 break;
3417 }
3418 case ExpressionTypes::MulAccReduction:
3419 case ExpressionTypes::ExtMulAccReduction: {
3421 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3423 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3424 << " (";
3425 O << "mul";
// With extends the mul is the third bundled recipe, otherwise the first.
3426 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3427 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3428 : ExpressionRecipes[0]);
3429 Mul->printFlags(O);
3430 if (IsExtended)
3431 O << "(";
3433 if (IsExtended) {
3434 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3435 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3436 << *Ext0->getScalarType() << "), (";
3437 } else {
3438 O << ", ";
3439 }
3441 if (IsExtended) {
3442 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3443 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3444 << *Ext1->getScalarType() << ")";
3445 }
3446 if (Red->isConditional()) {
3447 O << ", ";
3448 Red->getCondOp()->printAsOperand(O, SlotTracker);
3449 }
3450 O << ")";
3451 break;
3452 }
3453 }
3454}
3455
// Debug printer for a reduction recipe: prefixes the line with
// "PARTIAL-REDUCE" or "REDUCE" depending on isPartialReduction() and prints
// the flags followed by " reduce." and a parenthesized operand list.
// NOTE(review): the signature line and the lines printing the individual
// operands are missing from this listing (gaps in the embedded numbering);
// the owning class is presumably VPReductionRecipe - confirm.
3457 VPSlotTracker &SlotTracker) const {
3458 if (isPartialReduction())
3459 O << Indent << "PARTIAL-REDUCE ";
3460 else
3461 O << Indent << "REDUCE ";
3463 O << " = ";
3465 O << " +";
3466 printFlags(O);
3467 O << " reduce.";
3469 O << " (";
3471 if (isConditional()) {
3472 O << ", ";
3474 }
3475 O << ")";
3476}
3477
// Debug printer for the " vp.reduce." form of a reduction: prints "REDUCE",
// the flags and a parenthesized, comma-separated operand list.
// NOTE(review): the signature line and the lines printing the individual
// operands are missing from this listing (gaps in the embedded numbering);
// the owning class is presumably the EVL reduction recipe - confirm.
3479 VPSlotTracker &SlotTracker) const {
3480 O << Indent << "REDUCE ";
3482 O << " = ";
3484 O << " +";
3485 printFlags(O);
3486 O << " vp.reduce."
3489 << " (";
3491 O << ", ";
3493 if (isConditional()) {
3494 O << ", ";
3496 }
3497 O << ")";
3498}
3499
3500#endif
3501
// Emit one scalar clone of the underlying instruction, wire its operands to
// the scalar values recorded in State, insert it via State.Builder and record
// it as this recipe's scalar result.
// NOTE(review): the signature line (embedded line 3502) is missing from this
// listing; the assert text indicates this is VPReplicateRecipe::execute.
3503 assert(IsSingleScalar &&
3504 "VPReplicateRecipes must be unrolled before ::execute");
3505 auto *Instr = getUnderlyingInstr();
3506 Instruction *Cloned = Instr->clone();
3507 Type *ResultTy = getScalarType();
// Only value-producing clones get a name and a possible type adjustment.
3508 if (!ResultTy->isVoidTy()) {
3509 Cloned->setName(Instr->getName() + ".cloned");
3510 // The operands of the replicate recipe may have been narrowed, resulting in
3511 // a narrower result type. Update the type of the cloned instruction to the
3512 // correct type.
3513 if (ResultTy != Cloned->getType())
3514 Cloned->mutateType(ResultTy);
3515 }
3516
3517 applyFlags(*Cloned);
3518 applyMetadata(*Cloned);
3519
// The cast<> requires the clone to be a CmpInst whenever hasPredicate() holds.
3520 if (hasPredicate())
3521 cast<CmpInst>(Cloned)->setPredicate(getPredicate());
3522
3523 // Replace the operands of the cloned instructions with their scalar
3524 // equivalents in the new loop.
3525 for (const auto &[Idx, V] : enumerate(operands()))
3526 Cloned->setOperand(Idx, State.get(V, true));
3527
3528 // Place the cloned scalar in the new loop.
3529 State.Builder.Insert(Cloned);
3530
3531 State.set(this, Cloned, true);
3532
3533 // If we just cloned a new assumption, add it to the assumption cache.
3534 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3535 State.AC->registerAssumption(II);
3536}
3537
3538/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3539/// which the legacy cost model computes a SCEV expression when computing the
3540/// address cost. Computing SCEVs for VPValues is incomplete and returns
3541/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3542/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
/// Three outcomes are therefore possible: a SCEVCouldNotCompute object (passed
/// through unchanged), the computed SCEV when vputils::isAddressSCEVForCost
/// accepts it, or nullptr.
/// NOTE(review): the parameter line declaring PSE (embedded line 3544) is
/// missing from this listing.
3543static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3545 const Loop *L) {
3546 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
3547 if (isa<SCEVCouldNotCompute>(Addr))
3548 return Addr;
3549
3550 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3551}
3552
3553/// Return true if \p R is a predicated store with a loop-invariant address
3554/// only masked by the header mask.
/// Returns false early unless \p R is a store inside a replicator region and
/// \p PtrSCEV is non-null and loop-invariant in Ctx.L.
/// NOTE(review): the first signature line (embedded line 3555) and the
/// initializer of BOM (embedded line 3564) are missing from this listing.
3556 const SCEV *PtrSCEV,
3557 VPCostContext &Ctx) {
3558 const VPRegionBlock *ParentRegion = R.getRegion();
3559 if (R.getOpcode() != Instruction::Store || !ParentRegion ||
3560 !ParentRegion->isReplicator() || !PtrSCEV ||
3561 !Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
3562 return false;
3563 auto *BOM =
3565 return vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan());
3566}
3567
// Return the cost of replicating the underlying instruction UI for the given
// VF. The switch handles the opcodes listed below directly; every `break` and
// every opcode without a case ends at the Ctx.getLegacyCost(UI, VF) fallback
// at the bottom.
// NOTE(review): the first signature line and a number of statements (the
// definition of UI, several return statements and call heads) are missing
// from this listing - see the gaps in the embedded numbering. The comment on
// embedded line 3571 indicates this is VPReplicateRecipe::computeCost.
3569 VPCostContext &Ctx) const {
3571 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3572 // transform, avoid computing their cost multiple times for now.
3573 Ctx.SkipCostComputation.insert(UI);
3574
// NOTE(review): the statement guarded by this condition (embedded line 3576)
// is missing from this listing.
3575 if (VF.isScalable() && !isSingleScalar())
3577
3578 switch (UI->getOpcode()) {
3579 case Instruction::Alloca:
// NOTE(review): the statement guarded by this condition (embedded line 3581)
// is missing from this listing.
3580 if (VF.isScalable())
3582 return Ctx.TTI.getArithmeticInstrCost(
3583 Instruction::Mul, Ctx.Types.inferScalarType(this), Ctx.CostKind);
3584 case Instruction::GetElementPtr:
3585 // We mark this instruction as zero-cost because the cost of GEPs in
3586 // vectorized code depends on whether the corresponding memory instruction
3587 // is scalarized or not. Therefore, we handle GEPs with the memory
3588 // instruction cost.
3589 return 0;
3590 case Instruction::Call: {
// NOTE(review): the initializer of CalledFn (line 3592) and the definition
// of ArgOps (line 3594) are missing from this listing.
3591 auto *CalledFn =
3593 Type *ResultTy = Ctx.Types.inferScalarType(this);
3595 return computeCallCost(CalledFn, ResultTy, ArgOps, isSingleScalar(), VF,
3596 Ctx);
3597 }
3598 case Instruction::Add:
3599 case Instruction::Sub:
3600 case Instruction::FAdd:
3601 case Instruction::FSub:
3602 case Instruction::Mul:
3603 case Instruction::FMul:
3604 case Instruction::FDiv:
3605 case Instruction::FRem:
3606 case Instruction::Shl:
3607 case Instruction::LShr:
3608 case Instruction::AShr:
3609 case Instruction::And:
3610 case Instruction::Or:
3611 case Instruction::Xor:
3612 case Instruction::ICmp:
3613 case Instruction::FCmp:
// A per-instance cost is multiplied by 1 for a single scalar, otherwise by
// the fixed VF. NOTE(review): the head of this return statement (embedded
// line 3614) is missing from this listing.
3615 Ctx) *
3616 (isSingleScalar() ? 1 : VF.getFixedValue());
3617 case Instruction::SDiv:
3618 case Instruction::UDiv:
3619 case Instruction::SRem:
3620 case Instruction::URem: {
// NOTE(review): the initializer of ScalarCost (embedded line 3622) is
// missing from this listing.
3621 InstructionCost ScalarCost =
3623 if (isSingleScalar())
3624 return ScalarCost;
3625
3626 // If any of the operands is from a different replicate region and has its
3627 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3628 // model to avoid cost mis-match.
3629 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3630 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3631 if (!PredR)
3632 return false;
3633 return Ctx.skipCostComputation(
3635 PredR->getOperand(0)->getUnderlyingValue()),
3636 VF.isVector());
3637 }))
3638 break;
3639
3640 ScalarCost = ScalarCost * VF.getFixedValue() +
3641 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
3642 to_vector(operands()), VF);
3643 // If the recipe is not predicated (i.e. not in a replicate region), return
3644 // the scalar cost. Otherwise handle predicated cost.
// NOTE(review): getRegion() is dereferenced without a null check here, while
// the load/store case below tests the region pointer first - confirm that a
// non-single-scalar recipe always has a parent region at this point.
3645 if (!getRegion()->isReplicator())
3646 return ScalarCost;
3647
3648 // Account for the phi nodes that we will create.
3649 ScalarCost += VF.getFixedValue() *
3650 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3651 // Scale the cost by the probability of executing the predicated blocks.
3652 // This assumes the predicated block for each vector lane is equally
3653 // likely.
3654 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3655 return ScalarCost;
3656 }
3657 case Instruction::Load:
3658 case Instruction::Store: {
3659 bool IsLoad = UI->getOpcode() == Instruction::Load;
// The address is operand 0 for a load and operand 1 for a store.
3660 const VPValue *PtrOp = getOperand(!IsLoad);
3661 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
// NOTE(review): the condition guarding this break (embedded line 3662) is
// missing from this listing.
3663 break;
3664
3665 Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
3666 Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
3667 const Align Alignment = getLoadStoreAlignment(UI);
3668 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
// NOTE(review): the definition of OpInfo (embedded line 3669) is missing
// from this listing.
3670 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3671 bool UsedByLoadStoreAddress =
3672 !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(this);
3673 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3674 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3675 UsedByLoadStoreAddress ? UI : nullptr);
3676
3677 // Check if this is a predicated store with a loop-invariant address only
3678 // masked by the header mask. If so, return the uniform mem op cost.
3679 if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) {
3680 InstructionCost UniformCost =
3681 ScalarMemOpCost +
3682 Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr,
3683 /*Ptr=*/nullptr, Ctx.CostKind);
3684 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
3685 VPValue *StoredVal = getOperand(0);
// An extract is only charged when the stored value is defined inside the
// loop regions.
3686 if (!StoredVal->isDefinedOutsideLoopRegions())
3687 UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
3688 Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
3689 return UniformCost;
3690 }
3691
3692 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3693 InstructionCost ScalarCost =
3694 ScalarMemOpCost +
3695 Ctx.TTI.getAddressComputationCost(
3696 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3697 Ctx.CostKind);
3698 if (isSingleScalar())
3699 return ScalarCost;
3700
3701 SmallVector<const VPValue *> OpsToScalarize;
3702 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3703 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3704 // don't assign scalarization overhead in general, if the target prefers
3705 // vectorized addressing or the loaded value is used as part of an address
3706 // of another load or store.
3707 if (!UsedByLoadStoreAddress) {
3708 bool EfficientVectorLoadStore =
3709 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3710 if (!(IsLoad && !PreferVectorizedAddressing) &&
3711 !(!IsLoad && EfficientVectorLoadStore))
3712 append_range(OpsToScalarize, operands());
3713
3714 if (!EfficientVectorLoadStore)
3715 ResultTy = Ctx.Types.inferScalarType(this);
3716 }
3717
// NOTE(review): embedded lines 3718-3720 (presumably the definition of VIC
// and the declaration of Cost) are missing from this listing.
3721 (ScalarCost * VF.getFixedValue()) +
3722 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3723
3724 const VPRegionBlock *ParentRegion = getRegion();
3725 if (ParentRegion && ParentRegion->isReplicator()) {
// Without an address SCEV the predicated case is left to the legacy model.
3726 if (!PtrSCEV)
3727 break;
3728 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3729 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3730
// Charge the extraction of every lane of an i1 vector with VF elements.
3731 auto *VecI1Ty = VectorType::get(
3732 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3733 Cost += Ctx.TTI.getScalarizationOverhead(
3734 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3735 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3736
3737 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3738 // Artificially setting to a high enough value to practically disable
3739 // vectorization with such operations.
3740 return 3000000;
3741 }
3742 }
3743 return Cost;
3744 }
3745 case Instruction::SExt:
3746 case Instruction::ZExt:
3747 case Instruction::FPToUI:
3748 case Instruction::FPToSI:
3749 case Instruction::FPExt:
3750 case Instruction::PtrToInt:
3751 case Instruction::PtrToAddr:
3752 case Instruction::IntToPtr:
3753 case Instruction::SIToFP:
3754 case Instruction::UIToFP:
3755 case Instruction::Trunc:
3756 case Instruction::FPTrunc:
3757 case Instruction::Select:
3758 case Instruction::AddrSpaceCast: {
// Same scaling as the arithmetic case above. NOTE(review): the head of this
// return statement (embedded line 3759) is missing from this listing.
3760 Ctx) *
3761 (isSingleScalar() ? 1 : VF.getFixedValue());
3762 }
3763 case Instruction::ExtractValue:
3764 case Instruction::InsertValue:
3765 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3766 }
3767
// Reached by the `break`s above and by opcodes without a case.
3768 return Ctx.getLegacyCost(UI, VF);
3769}
3770
// Cost of calling \p CalledFn with result type \p ResultTy and arguments
// \p ArgOps. A free scalar intrinsic costs 0. A single-scalar call costs the
// cheaper of the call cost and the intrinsic cost. Otherwise the call cost is
// multiplied by the fixed VF and the scalarization overhead is added.
// NOTE(review): the first signature line (3771), the declaration of Tys
// (3774) and two early-return statements (3780, 3799) are missing from this
// listing. The call site in this file names the function computeCallCost.
3772 Function *CalledFn, Type *ResultTy, ArrayRef<const VPValue *> ArgOps,
3773 bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx) {
3775 ArgOps, [&](const VPValue *Op) { return Ctx.Types.inferScalarType(Op); });
3776
3777 Intrinsic::ID IntrinID = CalledFn->getIntrinsicID();
// Evaluated lazily: only the paths below that need the intrinsic cost pay
// for the TTI query.
3778 auto GetIntrinsicCost = [&] {
3779 if (!IntrinID)
3781 return Ctx.TTI.getIntrinsicInstrCost(
3782 IntrinsicCostAttributes(IntrinID, ResultTy, Tys), Ctx.CostKind);
3783 };
3784
3785 if (IntrinID && VPCostContext::isFreeScalarIntrinsic(IntrinID)) {
3786 assert(GetIntrinsicCost() == 0 && "scalarizing intrinsic should be free");
3787 return 0;
3788 }
3789
3790 InstructionCost ScalarCallCost =
3791 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3792 if (IsSingleScalar) {
3793 ScalarCallCost = std::min(ScalarCallCost, GetIntrinsicCost());
3794 return ScalarCallCost;
3795 }
3796
3797 // Scalarization overhead is undefined for scalable VFs.
3798 if (VF.isScalable())
3800
3801 return ScalarCallCost * VF.getFixedValue() +
3802 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
3803}
3804
3805#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer for a replicate recipe: prints "CLONE" for a single scalar
// and "REPLICATE" otherwise, then the call or opcode form, and appends
// " (S->V)" when a VPPredInstPHIRecipe user needs more than scalar values.
// NOTE(review): the first signature line and the lines printing the result
// and the operands are missing from this listing (gaps in the embedded
// numbering).
3807 VPSlotTracker &SlotTracker) const {
3808 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3809
3810 VPTypeAnalysis TypeInfo(*getParent()->getPlan());
3811 if (!TypeInfo.inferScalarType(this)->isVoidTy()) {
3813 O << " = ";
3814 }
3815 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3816 O << "call";
3817 printFlags(O);
// NOTE(review): getCalledFunction() is dereferenced unconditionally here;
// confirm it cannot be null for calls that reach this printer.
3818 O << "@" << CB->getCalledFunction()->getName() << "(";
3820 O, [&O, &SlotTracker](VPValue *Op) {
3821 Op->printAsOperand(O, SlotTracker);
3822 });
3823 O << ")";
3824 } else {
3826 printFlags(O);
3828 }
3829
3830 // Find if the recipe is used by a widened recipe via an intervening
3831 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3832 if (any_of(users(), [](const VPUser *U) {
3833 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
3834 return !vputils::onlyScalarValuesUsed(PredR);
3835 return false;
3836 }))
3837 O << " (S->V)";
3838}
3839#endif
3840
// Never expected to run: per the message, the recipe must already have been
// removed when its replicate region was dissolved.
// NOTE(review): the signature line (embedded line 3841) is missing from this
// listing; presumably VPBranchOnMaskRecipe::execute - confirm.
3842 llvm_unreachable("recipe must be removed when dissolving replicate region");
3843}
3844
// Always returns a cost of 0; see the rationale in the body comment.
// NOTE(review): the first signature line (embedded line 3845) is missing from
// this listing; presumably VPBranchOnMaskRecipe::computeCost - confirm.
3846 VPCostContext &Ctx) const {
3847 // The legacy cost model doesn't assign costs to branches for individual
3848 // replicate regions. Match the current behavior in the VPlan cost model for
3849 // now.
3850 return 0;
3851}
3852
// Never expected to run: per the message, the recipe must already have been
// removed when its replicate region was dissolved.
// NOTE(review): the signature line (embedded line 3853) is missing from this
// listing; presumably VPPredInstPHIRecipe::execute - confirm.
3854 llvm_unreachable("recipe must be removed when dissolving replicate region");
3855}
3856
3857#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "PHI-PREDICATED-INSTRUCTION " followed by " = ".
// NOTE(review): the signature line and the lines printing the result and the
// operands (embedded lines 3858, 3861, 3863) are missing from this listing.
3859 VPSlotTracker &SlotTracker) const {
3860 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3862 O << " = ";
3864}
3865#endif
3866
// Cost of a widened memory access for the given VF. Non-consecutive accesses
// are costed as gather/scatter intrinsics plus an address computation.
// Consecutive accesses are costed as a masked load/store intrinsic when
// IsMasked is set and as a plain memory operation otherwise.
// NOTE(review): the first signature line and several statements (definitions
// of IsLoad and Cost, parts of call argument lists) are missing from this
// listing - see the gaps in the embedded numbering. Callers in this file
// refer to this function as VPWidenMemoryRecipe::computeCost.
3868 VPCostContext &Ctx) const {
3869 const VPRecipeBase *R = getAsRecipe();
// For stores the value type is inferred from operand 1 of the recipe.
3871 Type *ScalarTy = IsLoad ? cast<VPSingleDefRecipe>(R)->getScalarType()
3872 : Ctx.Types.inferScalarType(R->getOperand(1));
3873 Type *Ty = toVectorTy(ScalarTy, VF);
3874 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
3875 ->getAddressSpace();
3876 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
3877
3878 if (!Consecutive) {
3879 // TODO: Using the original IR may not be accurate.
3880 // Currently, ARM will use the underlying IR to calculate gather/scatter
3881 // instruction cost.
// The lambda is only referenced from the assert below, hence maybe_unused.
3882 [[maybe_unused]] auto IsReverseMask = [this, R]() {
3883 VPValue *Mask = getMask();
3884 if (!Mask)
3885 return false;
3886
3889
3890 return match(Mask, m_Reverse(m_VPValue()));
3891 };
3892 assert(!IsReverseMask() &&
3893 "Inconsecutive memory access should not have reverse order");
3894 Type *PtrTy = Ctx.Types.inferScalarType(getAddr());
3895 const Value *Ptr = getAddr()->getUnderlyingValue();
3896
3897 // If the address value is uniform across all lanes, then the address can be
3898 // calculated with scalar type and broadcast.
// NOTE(review): the condition guarding the next statement (embedded line
// 3899) is missing from this listing.
3900 PtrTy = toVectorTy(PtrTy, VF);
3901
// Pick the gather/scatter intrinsic matching the concrete recipe class.
3902 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_gather
3903 : isa<VPWidenStoreRecipe>(R) ? Intrinsic::masked_scatter
3904 : isa<VPWidenLoadEVLRecipe>(R) ? Intrinsic::vp_gather
3905 : Intrinsic::vp_scatter;
3906 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
3907 Ctx.CostKind) +
3908 Ctx.TTI.getMemIntrinsicInstrCost(
3910 &Ingredient),
3911 Ctx.CostKind);
3912 }
3913
// NOTE(review): the declaration of Cost (embedded line 3914) is missing from
// this listing.
3915 if (IsMasked) {
3916 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_load
3917 : Intrinsic::masked_store;
3918 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3919 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
3920 } else {
3921 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3923 : R->getOperand(1));
3924 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
3925 OpInfo, &Ingredient);
3926 }
3927 return Cost;
3928}
3929
// Emit the widened load: a masked gather for a non-consecutive access, a
// masked load when a mask is present, and a plain aligned load otherwise.
// The result is recorded in State for this recipe.
// NOTE(review): the signature line (embedded line 3930) and one statement
// before State.set (embedded line 3952) are missing from this listing;
// presumably this is VPWidenLoadRecipe::execute - confirm.
3931 Type *ScalarDataTy = getScalarType();
3932 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3933 bool CreateGather = !isConsecutive();
3934
3935 auto &Builder = State.Builder;
3936 Value *Mask = nullptr;
3937 if (auto *VPMask = getMask())
3938 Mask = State.get(VPMask);
3939
// A gather needs the address as a vector; otherwise the scalar address is
// requested.
3940 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
3941 Value *NewLI;
3942 if (CreateGather) {
3943 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
3944 "wide.masked.gather");
3945 } else if (Mask) {
// Masked-off lanes are filled with poison.
3946 NewLI =
3947 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
3948 PoisonValue::get(DataTy), "wide.masked.load");
3949 } else {
3950 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
3951 }
3953 State.set(this, NewLI);
3954}
3955
3956#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "WIDEN " followed by " = load ".
// NOTE(review): the signature line and the lines printing the result and the
// operands (embedded lines 3957, 3960, 3962) are missing from this listing.
3958 VPSlotTracker &SlotTracker) const {
3959 O << Indent << "WIDEN ";
3961 O << " = load ";
3963}
3964#endif
3965
// Emit the EVL-predicated load as a vp.gather (non-consecutive) or vp.load
// (consecutive) intrinsic call and record the result in State.
// NOTE(review): the signature line (embedded line 3966) and the argument of
// addParamAttr (embedded line 3990) are missing from this listing; presumably
// this is VPWidenLoadEVLRecipe::execute - confirm.
3967 Type *ScalarDataTy = getScalarType();
3968 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
3969 bool CreateGather = !isConsecutive();
3970
3971 auto &Builder = State.Builder;
3972 CallInst *NewLI;
// The EVL is read from lane 0.
3973 Value *EVL = State.get(getEVL(), VPLane(0));
3974 Value *Addr = State.get(getAddr(), !CreateGather);
3975 Value *Mask = nullptr;
// The intrinsics always take a mask operand, so use an all-true splat when
// the recipe has none.
3976 if (VPValue *VPMask = getMask())
3977 Mask = State.get(VPMask);
3978 else
3979 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3980
3981 if (CreateGather) {
3982 NewLI =
3983 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
3984 nullptr, "wide.masked.gather");
3985 } else {
3986 NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
3987 {Addr, Mask, EVL}, nullptr, "vp.op.load");
3988 }
3989 NewLI->addParamAttr(
3991 applyMetadata(*NewLI);
3992 Instruction *Res = NewLI;
3993 State.set(this, Res);
3994}
3995
// Cost of the EVL load. Non-consecutive or masked accesses defer to
// VPWidenMemoryRecipe::computeCost; the remaining case is costed as a vp.load
// memory intrinsic.
// NOTE(review): the first signature line (embedded line 3996) is missing from
// this listing; presumably VPWidenLoadEVLRecipe::computeCost - confirm.
3997 VPCostContext &Ctx) const {
3998 if (!Consecutive || IsMasked)
3999 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4000
4001 // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4002 // here because the EVL recipes use EVL to replace the tail mask, but the
4003 // legacy model always calculates the cost of the mask.
4004 // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() when we
4005 // don't need to compare to the legacy cost model.
4006 Type *Ty = toVectorTy(getScalarType(), VF);
4007 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4008 ->getAddressSpace();
4009 return Ctx.TTI.getMemIntrinsicInstrCost(
4010 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
4011 Ctx.CostKind);
4012}
4013
4014#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "WIDEN " followed by " = vp.load ".
// NOTE(review): the signature line and the lines printing the result and the
// operands (embedded lines 4015, 4018, 4020) are missing from this listing.
4016 VPSlotTracker &SlotTracker) const {
4017 O << Indent << "WIDEN ";
4019 O << " = vp.load ";
4021}
4022#endif
4023
// Emit the widened store: a masked scatter for a non-consecutive access, a
// masked store when a mask is present, and a plain aligned store otherwise.
// NOTE(review): the signature line (embedded line 4024) is missing from this
// listing; presumably VPWidenStoreRecipe::execute - confirm.
4025 VPValue *StoredVPValue = getStoredValue();
4026 bool CreateScatter = !isConsecutive();
4027
4028 auto &Builder = State.Builder;
4029
4030 Value *Mask = nullptr;
4031 if (auto *VPMask = getMask())
4032 Mask = State.get(VPMask);
4033
4034 Value *StoredVal = State.get(StoredVPValue);
// A scatter needs the address as a vector; otherwise the scalar address is
// requested.
4035 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
4036 Instruction *NewSI = nullptr;
4037 if (CreateScatter)
4038 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
4039 else if (Mask)
4040 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
4041 else
4042 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
4043 applyMetadata(*NewSI);
4044}
4045
4046#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "WIDEN store ".
// NOTE(review): the signature line and the line printing the operands
// (embedded lines 4047, 4050) are missing from this listing.
4048 VPSlotTracker &SlotTracker) const {
4049 O << Indent << "WIDEN store ";
4051}
4052#endif
4053
// Emit the EVL-predicated store as a vp.scatter (non-consecutive) or vp.store
// (consecutive) intrinsic call with a void result type.
// NOTE(review): the signature line (embedded line 4054) and the argument of
// addParamAttr (embedded line 4080) are missing from this listing; presumably
// this is VPWidenStoreEVLRecipe::execute - confirm.
4055 VPValue *StoredValue = getStoredValue();
4056 bool CreateScatter = !isConsecutive();
4057
4058 auto &Builder = State.Builder;
4059
4060 CallInst *NewSI = nullptr;
4061 Value *StoredVal = State.get(StoredValue);
// The EVL is read from lane 0.
4062 Value *EVL = State.get(getEVL(), VPLane(0));
4063 Value *Mask = nullptr;
// The intrinsics always take a mask operand, so use an all-true splat when
// the recipe has none.
4064 if (VPValue *VPMask = getMask())
4065 Mask = State.get(VPMask);
4066 else
4067 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4068
4069 Value *Addr = State.get(getAddr(), !CreateScatter);
4070 if (CreateScatter) {
4071 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4072 Intrinsic::vp_scatter,
4073 {StoredVal, Addr, Mask, EVL});
4074 } else {
4075 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
4076 Intrinsic::vp_store,
4077 {StoredVal, Addr, Mask, EVL});
4078 }
4079 NewSI->addParamAttr(
4081 applyMetadata(*NewSI);
4082}
4083
// Cost of the EVL store. Non-consecutive or masked accesses defer to
// VPWidenMemoryRecipe::computeCost; the remaining case is costed as a
// vp.store memory intrinsic on the vector type of the stored value.
// NOTE(review): the first signature line (embedded line 4084) is missing from
// this listing; presumably VPWidenStoreEVLRecipe::computeCost - confirm.
4085 VPCostContext &Ctx) const {
4086 if (!Consecutive || IsMasked)
4087 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4088
4089 // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4090 // here because the EVL recipes use EVL to replace the tail mask, but the
4091 // legacy model always calculates the cost of the mask.
4092 // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() when we
4093 // don't need to compare to the legacy cost model.
4094 Type *Ty = toVectorTy(Ctx.Types.inferScalarType(getStoredValue()), VF);
4095 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4096 ->getAddressSpace();
4097 return Ctx.TTI.getMemIntrinsicInstrCost(
4098 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4099 Ctx.CostKind);
4100}
4101
4102#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer: emits "WIDEN vp.store ".
// NOTE(review): the signature line and the line printing the operands
// (embedded lines 4103, 4106) are missing from this listing.
4104 VPSlotTracker &SlotTracker) const {
4105 O << Indent << "WIDEN vp.store ";
4107}
4108#endif
4109
// Cast the vector value V to the vector type \p DstVTy, which must have the
// same element count and the same element bit width according to \p DL.
// Uses a single bit-or-pointer cast when the element types allow it, and a
// two-step cast through an equally sized integer vector otherwise.
// NOTE(review): the first signature line (embedded line 4110) is missing from
// this listing, so the function name and the declarations of Builder and V
// are not visible here - confirm against the full file.
4111 VectorType *DstVTy, const DataLayout &DL) {
4112 // Verify that V is a vector type with same number of elements as DstVTy.
4113 auto VF = DstVTy->getElementCount();
4114 auto *SrcVecTy = cast<VectorType>(V->getType());
4115 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4116 Type *SrcElemTy = SrcVecTy->getElementType();
4117 Type *DstElemTy = DstVTy->getElementType();
4118 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4119 "Vector elements must have same size");
4120
4121 // Do a direct cast if element types are castable.
4122 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
4123 return Builder.CreateBitOrPointerCast(V, DstVTy);
4124 }
4125 // V cannot be directly casted to desired vector type.
4126 // May happen when V is a floating point vector but DstVTy is a vector of
4127 // pointers or vice-versa. Handle this using a two-step bitcast using an
4128 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4129 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4130 "Only one type should be a pointer type");
4131 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4132 "Only one type should be a floating point type");
// The intermediate integer type has exactly the bit width of one source
// element.
4133 Type *IntTy =
4134 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
4135 auto *VecIntTy = VectorType::get(IntTy, VF);
4136 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
4137 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
4138}
4139
4140/// Return a vector containing interleaved elements from multiple
4141/// smaller input vectors.
/// All inputs must have the same vector type and there must be at least two
/// of them (both are asserted). Scalable vectors go through
/// Builder.CreateVectorInterleave; fixed-length vectors are concatenated and
/// then shuffled with an interleave mask.
/// NOTE(review): the first signature line (embedded line 4142) is missing
/// from this listing; a call site in this file names the function
/// interleaveVectors.
4143 const Twine &Name) {
4144 unsigned Factor = Vals.size();
4145 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4146
4147 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
4148#ifndef NDEBUG
4149 for (Value *Val : Vals)
4150 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4151#endif
4152
4153 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4154 // must use intrinsics to interleave.
4155 if (VecTy->isScalableTy()) {
4156 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4157 return Builder.CreateVectorInterleave(Vals, Name);
4158 }
4159
4160 // Fixed length. Start by concatenating all vectors into a wide vector.
4161 Value *WideVec = concatenateVectors(Builder, Vals);
4162
4163 // Interleave the elements into the wide vector.
4164 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4165 return Builder.CreateShuffleVector(
4166 WideVec, createInterleaveMask(NumElts, Factor), Name);
4167}
4168
4169// Try to vectorize the interleave group that \p Instr belongs to.
4170//
4171// E.g. Translate following interleaved load group (factor = 3):
4172// for (i = 0; i < N; i+=3) {
4173// R = Pic[i]; // Member of index 0
4174// G = Pic[i+1]; // Member of index 1
4175// B = Pic[i+2]; // Member of index 2
4176// ... // do something to R, G, B
4177// }
4178// To:
4179// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4180// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4181// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4182// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4183//
4184// Or translate following interleaved store group (factor = 3):
4185// for (i = 0; i < N; i+=3) {
4186// ... do something to R, G, B
4187// Pic[i] = R; // Member of index 0
4188// Pic[i+1] = G; // Member of index 1
4189// Pic[i+2] = B; // Member of index 2
4190// }
4191// To:
4192// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4193// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4194// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4195// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4196// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
// NOTE(review): the signature line of this definition (listing line 4197) is
// absent from this listing; the body reads `State`, presumably a
// VPTransformState reference parameter -- confirm against the upstream file.
// Listing lines 4200 and 4260 are absent as well; `Group` and `VPDefs`, used
// below, are presumably declared on them.
//
// Emits one wide (optionally masked) load or store for the whole interleave
// group, then splits the loaded value per member (load) or interleaves the
// per-member values beforehand (store).
4198 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4199 "Masking gaps for scalable vectors is not yet supported.");
4201 Instruction *Instr = Group->getInsertPos();
4202
4203 // Prepare for the vector type of the interleaved load/store.
4204 Type *ScalarTy = getLoadStoreType(Instr);
4205 unsigned InterleaveFactor = Group->getFactor();
4206 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4207
4208 VPValue *BlockInMask = getMask();
4209 VPValue *Addr = getAddr();
4210 Value *ResAddr = State.get(Addr, VPLane(0));
4211
// Builds the mask for the wide access. Scalable VF: InterleaveFactor copies of
// the block mask are interleaved (gap masks are asserted absent). Fixed VF:
// the block mask is replicated via a shuffle and AND-ed with MaskForGaps when
// one is given; with no block mask, MaskForGaps is returned unchanged.
4212 auto CreateGroupMask = [&BlockInMask, &State,
4213 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4214 if (State.VF.isScalable()) {
4215 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4216 assert(InterleaveFactor <= 8 &&
4217 "Unsupported deinterleave factor for scalable vectors");
4218 auto *ResBlockInMask = State.get(BlockInMask);
4219 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4220 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4221 }
4222
4223 if (!BlockInMask)
4224 return MaskForGaps;
4225
4226 Value *ResBlockInMask = State.get(BlockInMask);
4227 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4228 ResBlockInMask,
4229 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4230 "interleaved.mask");
4231 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4232 ShuffledMask, MaskForGaps)
4233 : ShuffledMask;
4234 };
4235
4236 const DataLayout &DL = Instr->getDataLayout();
4237 // Vectorize the interleaved load group.
4238 if (isa<LoadInst>(Instr)) {
4239 Value *MaskForGaps = nullptr;
4240 if (needsMaskForGaps()) {
4241 MaskForGaps =
4242 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4243 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4244 }
4245
// A masked load (with poison pass-through) is used when either mask exists;
// otherwise a plain aligned load is emitted.
4246 Instruction *NewLoad;
4247 if (BlockInMask || MaskForGaps) {
4248 Value *GroupMask = CreateGroupMask(MaskForGaps);
4249 Value *PoisonVec = PoisonValue::get(VecTy);
4250 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4251 Group->getAlign(), GroupMask,
4252 PoisonVec, "wide.masked.vec");
4253 } else
4254 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4255 Group->getAlign(), "wide.vec");
4256 applyMetadata(*NewLoad);
4257 // TODO: Also manage existing metadata using VPIRMetadata.
4258 Group->addMetadata(NewLoad);
4259
4261 if (VecTy->isScalableTy()) {
4262 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4263 // so must use intrinsics to deinterleave.
4264 assert(InterleaveFactor <= 8 &&
4265 "Unsupported deinterleave factor for scalable vectors");
// From here on NewLoad refers to the deinterleave intrinsic call, whose
// per-member results are read with extractvalue below.
4266 NewLoad = State.Builder.CreateIntrinsic(
4267 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4268 NewLoad->getType(), NewLoad,
4269 /*FMFSource=*/nullptr, "strided.vec");
4270 }
4271
// Returns the sub-vector for group slot Index: an extractvalue for scalable
// VF, a strided shuffle of the wide load for fixed VF.
4272 auto CreateStridedVector = [&InterleaveFactor, &State,
4273 &NewLoad](unsigned Index) -> Value * {
4274 assert(Index < InterleaveFactor && "Illegal group index");
4275 if (State.VF.isScalable())
4276 return State.Builder.CreateExtractValue(NewLoad, Index);
4277
4278 // For fixed length VF, use shuffle to extract the sub-vectors from the
4279 // wide load.
4280 auto StrideMask =
4281 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4282 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4283 "strided.vec");
4284 };
4285
// I walks every group slot (including gaps); J advances only for present
// members and indexes VPDefs.
4286 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4287 Instruction *Member = Group->getMember(I);
4288
4289 // Skip the gaps in the group.
4290 if (!Member)
4291 continue;
4292
4293 Value *StridedVec = CreateStridedVector(I);
4294
4295 // If this member has different type, cast the result type.
4296 if (Member->getType() != ScalarTy) {
4297 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4298 StridedVec =
4299 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4300 }
4301
4302 if (Group->isReverse())
4303 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4304
4305 State.set(VPDefs[J], StridedVec);
4306 ++J;
4307 }
4308 return;
4309 }
4310
4311 // The sub vector type for current instruction.
4312 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4313
4314 // Vectorize the interleaved store group.
4315 Value *MaskForGaps =
4316 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4317 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4318 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4319 ArrayRef<VPValue *> StoredValues = getStoredValues();
4320 // Collect the stored vector from each member.
4321 SmallVector<Value *, 4> StoredVecs;
// StoredIdx advances only for present members, so StoredValues is indexed
// without gaps while StoredVecs gets one entry per group slot.
4322 unsigned StoredIdx = 0;
4323 for (unsigned i = 0; i < InterleaveFactor; i++) {
4324 assert((Group->getMember(i) || MaskForGaps) &&
4325 "Fail to get a member from an interleaved store group");
4326 Instruction *Member = Group->getMember(i);
4327
4328 // Skip the gaps in the group.
4329 if (!Member) {
// A gap contributes a poison sub-vector (the local is named Undef, but the
// value is poison).
4330 Value *Undef = PoisonValue::get(SubVT);
4331 StoredVecs.push_back(Undef);
4332 continue;
4333 }
4334
4335 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4336 ++StoredIdx;
4337
4338 if (Group->isReverse())
4339 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4340
4341 // If this member has different type, cast it to a unified type.
4342
4343 if (StoredVec->getType() != SubVT)
4344 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4345
4346 StoredVecs.push_back(StoredVec);
4347 }
4348
4349 // Interleave all the smaller vectors into one wider vector.
4350 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4351 Instruction *NewStoreInstr;
4352 if (BlockInMask || MaskForGaps) {
4353 Value *GroupMask = CreateGroupMask(MaskForGaps);
4354 NewStoreInstr = State.Builder.CreateMaskedStore(
4355 IVec, ResAddr, Group->getAlign(), GroupMask);
4356 } else
4357 NewStoreInstr =
4358 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4359
4360 applyMetadata(*NewStoreInstr);
4361 // TODO: Also manage existing metadata using VPIRMetadata.
4362 Group->addMetadata(NewStoreInstr);
4363}
4364
4365#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4367 VPSlotTracker &SlotTracker) const {
// Debug-only printer (inside the #if guard above): prints the group factor and
// insert position, an optional mask, then one line per present member saying
// whether it is a store to, or a load from, that group index.
// NOTE(review): listing lines 4366, 4368, 4372, 4385 and 4389 are absent from
// this listing; the first line of the signature, the declaration of `IG`, and
// the per-operand printing (which presumably uses OpIdx) are not visible.
4369 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4370 IG->getInsertPos()->printAsOperand(O, false);
4371 O << ", ";
4373 VPValue *Mask = getMask();
4374 if (Mask) {
4375 O << ", ";
4376 Mask->printAsOperand(O, SlotTracker);
4377 }
4378
// OpIdx counts present members only; i counts every group slot.
4379 unsigned OpIdx = 0;
4380 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4381 if (!IG->getMember(i))
4382 continue;
4383 if (getNumStoreOperands() > 0) {
4384 O << "\n" << Indent << " store ";
4386 O << " to index " << i;
4387 } else {
4388 O << "\n" << Indent << " ";
4390 O << " = load from index " << i;
4391 }
4392 ++OpIdx;
4393 }
4394}
4395#endif
4396
// NOTE(review): the signature line (listing line 4397), the opening of the
// second assert (4400) and line 4402 are absent from this listing; `State`
// and `Group`, used below, are presumably introduced on those lines.
//
// EVL variant of interleave-group codegen: emits a single vp.load or vp.store
// for the whole group, limited by an explicit vector length scaled by the
// interleave factor. Only scalable VFs are accepted (asserted below).
4398 assert(State.VF.isScalable() &&
4399 "Only support scalable VF for EVL tail-folding.");
4401 "Masking gaps for scalable vectors is not yet supported.");
4403 Instruction *Instr = Group->getInsertPos();
4404
4405 // Prepare for the vector type of the interleaved load/store.
4406 Type *ScalarTy = getLoadStoreType(Instr);
4407 unsigned InterleaveFactor = Group->getFactor();
4408 assert(InterleaveFactor <= 8 &&
4409 "Unsupported deinterleave/interleave factor for scalable vectors");
4410 ElementCount WideVF = State.VF * InterleaveFactor;
4411 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4412
4413 VPValue *Addr = getAddr();
4414 Value *ResAddr = State.get(Addr, VPLane(0));
4415 Value *EVL = State.get(getEVL(), VPLane(0));
// The wide access covers InterleaveFactor elements per EVL element, so the
// EVL is multiplied by the factor (flagged nuw and nsw).
4416 Value *InterleaveEVL = State.Builder.CreateMul(
4417 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4418 /* NUW= */ true, /* NSW= */ true);
4419 LLVMContext &Ctx = State.Builder.getContext();
4420
// With a block mask, interleave InterleaveFactor copies of it; otherwise use
// an all-true mask of the wide element count.
4421 Value *GroupMask = nullptr;
4422 if (VPValue *BlockInMask = getMask()) {
4423 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4424 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4425 } else {
4426 GroupMask =
4427 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4428 }
4429
4430 // Vectorize the interleaved load group.
4431 if (isa<LoadInst>(Instr)) {
4432 CallInst *NewLoad = State.Builder.CreateIntrinsic(
4433 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4434 "wide.vp.load");
// The group alignment is attached to argument 0, the pointer operand.
4435 NewLoad->addParamAttr(0,
4436 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4437
4438 applyMetadata(*NewLoad);
4439 // TODO: Also manage existing metadata using VPIRMetadata.
4440 Group->addMetadata(NewLoad);
4441
4442 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4443 // so must use intrinsics to deinterleave.
4444 NewLoad = State.Builder.CreateIntrinsic(
4445 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4446 NewLoad->getType(), NewLoad,
4447 /*FMFSource=*/nullptr, "strided.vec");
4448
4449 const DataLayout &DL = Instr->getDataLayout();
// I walks every group slot; J advances only for present members and selects
// the defined value to set.
4450 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4451 Instruction *Member = Group->getMember(I);
4452 // Skip the gaps in the group.
4453 if (!Member)
4454 continue;
4455
4456 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4457 // If this member has different type, cast the result type.
4458 if (Member->getType() != ScalarTy) {
4459 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4460 StridedVec =
4461 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4462 }
4463
4464 State.set(getVPValue(J), StridedVec);
4465 ++J;
4466 }
4467 return;
4468 } // End for interleaved load.
4469
4470 // The sub vector type for current instruction.
4471 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4472 // Vectorize the interleaved store group.
4473 ArrayRef<VPValue *> StoredValues = getStoredValues();
4474 // Collect the stored vector from each member.
4475 SmallVector<Value *, 4> StoredVecs;
4476 const DataLayout &DL = Instr->getDataLayout();
// StoredIdx advances only for present members; gaps get a poison sub-vector.
4477 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4478 Instruction *Member = Group->getMember(I);
4479 // Skip the gaps in the group.
4480 if (!Member) {
4481 StoredVecs.push_back(PoisonValue::get(SubVT));
4482 continue;
4483 }
4484
4485 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4486 // If this member has different type, cast it to a unified type.
4487 if (StoredVec->getType() != SubVT)
4488 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4489
4490 StoredVecs.push_back(StoredVec);
4491 ++StoredIdx;
4492 }
4493
4494 // Interleave all the smaller vectors into one wider vector.
4495 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4496 CallInst *NewStore =
4497 State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
4498 {IVec, ResAddr, GroupMask, InterleaveEVL});
// For vp.store the pointer is argument 1 (argument 0 is the stored value).
4499 NewStore->addParamAttr(1,
4500 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4501
4502 applyMetadata(*NewStore);
4503 // TODO: Also manage existing metadata using VPIRMetadata.
4504 Group->addMetadata(NewStore);
4505}
4506
4507#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4509 VPSlotTracker &SlotTracker) const {
// Debug-only printer (inside the #if guard above): same layout as the non-EVL
// interleave-group printer, but members are labelled vp.store / vp.load.
// NOTE(review): listing lines 4508, 4510, 4514, 4516, 4528 and 4532 are absent
// from this listing; the first line of the signature, the declaration of
// `IG`, and the operand printing between the ", " separators are not visible.
4511 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4512 IG->getInsertPos()->printAsOperand(O, false);
4513 O << ", ";
4515 O << ", ";
4517 if (VPValue *Mask = getMask()) {
4518 O << ", ";
4519 Mask->printAsOperand(O, SlotTracker);
4520 }
4521
// OpIdx counts present members only; i counts every group slot.
4522 unsigned OpIdx = 0;
4523 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4524 if (!IG->getMember(i))
4525 continue;
4526 if (getNumStoreOperands() > 0) {
4527 O << "\n" << Indent << " vp.store ";
4529 O << " to index " << i;
4530 } else {
4531 O << "\n" << Indent << " ";
4533 O << " = vp.load from index " << i;
4534 }
4535 ++OpIdx;
4536 }
4537}
4538#endif
4539
4541 VPCostContext &Ctx) const {
// Returns the TTI cost of the whole interleaved memory operation, plus one
// reverse-shuffle cost per member when the group is reversed.
// NOTE(review): listing lines 4540 (first line of the signature, which
// presumably declares `VF`) and 4562 (presumably the declaration of
// `Indices`) are absent from this listing.
4542 Instruction *InsertPos = getInsertPos();
4543 // Find the VPValue index of the interleave group. We need to skip gaps.
4544 unsigned InsertPosIdx = 0;
// Bound the scan by the group factor. The previous condition was the bare
// value `IG->getFactor()`, which is always non-zero, so the loop could only
// end through the break below and had no upper bound on Idx.
4545 for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
4546 if (auto *Member = IG->getMember(Idx)) {
4547 if (Member == InsertPos)
4548 break;
4549 InsertPosIdx++;
4550 }
// Loads define values, stores only have stored operands; pick whichever
// exists to infer the scalar element type at the insert position.
4551 Type *ValTy = Ctx.Types.inferScalarType(
4552 getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
4553 : getStoredValues()[InsertPosIdx]);
4554 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4555 unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
4556 ->getAddressSpace();
4557
4558 unsigned InterleaveFactor = IG->getFactor();
4559 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4560
4561 // Holds the indices of existing members in the interleaved group.
4563 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4564 if (IG->getMember(IF))
4565 Indices.push_back(IF);
4566
4567 // Calculate the cost of the whole interleaved group.
4568 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4569 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4570 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4571
4572 if (!IG->isReverse())
4573 return Cost;
4574
// Reversed groups pay one SK_Reverse shuffle per member on top.
4575 return Cost + IG->getNumMembers() *
4576 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4577 VectorTy, VectorTy, {}, Ctx.CostKind,
4578 0);
4579}
4580
// NOTE(review): the signature line (listing line 4581) is absent from this
// listing; `IsScalable` is presumably a bool parameter.
// True when vputils::onlyScalarValuesUsed holds for this recipe and, if
// IsScalable is set, vputils::onlyFirstLaneUsed holds as well.
4582 return vputils::onlyScalarValuesUsed(this) &&
4583 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4584}
4585
4586#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4588 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// Debug-only printer for the WIDEN-POINTER-INDUCTION recipe; the recipe must
// have either 3 or 5 operands, and two further comma-separated items are
// printed in the 5-operand case.
// NOTE(review): listing lines 4587, 4592, 4594, 4596, 4598, 4601 and 4603 are
// absent from this listing; the calls that print the result and each operand
// between the separators are not visible.
4589 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4590 "unexpected number of operands");
4591 O << Indent << "EMIT ";
4593 O << " = WIDEN-POINTER-INDUCTION ";
4595 O << ", ";
4597 O << ", ";
4599 if (getNumOperands() == 5) {
4600 O << ", ";
4602 O << ", ";
4604 }
4605}
4606
4608 VPSlotTracker &SlotTracker) const {
// Debug-only printer: "EMIT <result> = EXPAND SCEV <Expr>".
// NOTE(review): listing lines 4607 and 4610 are absent from this listing; the
// first line of the signature and the call that prints the result are not
// visible.
4609 O << Indent << "EMIT ";
4611 O << " = EXPAND SCEV " << *Expr;
4612}
4613#endif
4614
4615#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4617 VPSlotTracker &SlotTracker) const {
// Debug-only printer: "EMIT <result> = WIDEN-CANONICAL-INDUCTION" followed by
// the recipe flags.
// NOTE(review): listing lines 4616, 4619 and 4622 are absent from this
// listing; the first line of the signature and the result/operand printing
// are not visible.
4618 O << Indent << "EMIT ";
4620 O << " = WIDEN-CANONICAL-INDUCTION";
4621 printFlags(O);
4623}
4624#endif
4625
// NOTE(review): the signature line (listing line 4626) is absent from this
// listing; `State` is presumably a VPTransformState reference parameter.
//
// Creates the "vector.recur" phi and adds only its incoming value from the
// block mapped to this recipe's first CFG predecessor; no other incoming
// value is added here.
4627 auto &Builder = State.Builder;
4628 // Create a vector from the initial value.
4629 auto *VectorInit = getStartValue()->getLiveInIRValue();
4630
4631 Type *VecTy = State.VF.isScalar()
4632 ? VectorInit->getType()
4633 : VectorType::get(VectorInit->getType(), State.VF);
4634
4635 BasicBlock *VectorPH =
4636 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4637 if (State.VF.isVector()) {
// For vector VFs the scalar start value is placed in the last lane
// (index RuntimeVF - 1) of an otherwise-poison vector. The insert is emitted
// before VectorPH's terminator; the guard restores the insert point after.
4638 auto *IdxTy = Builder.getInt32Ty();
4639 auto *One = ConstantInt::get(IdxTy, 1);
4640 IRBuilder<>::InsertPointGuard Guard(Builder);
4641 Builder.SetInsertPoint(VectorPH->getTerminator());
4642 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4643 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4644 VectorInit = Builder.CreateInsertElement(
4645 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4646 }
4647
4648 // Create a phi node for the new recurrence.
4649 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4650 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4651 Phi->addIncoming(VectorInit, VectorPH);
4652 State.set(this, Phi);
4653}
4654
4657 VPCostContext &Ctx) const {
// Scalar VF: cost of a PHI as reported by TTI. Any other VF: 0.
// NOTE(review): the first line(s) of the signature (listing lines 4655-4656)
// are absent from this listing; `VF` is presumably declared there.
4658 if (VF.isScalar())
4659 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4660
4661 return 0;
4662}
4663
4664#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4666 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// Debug-only printer: "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
// NOTE(review): listing lines 4665, 4668 and 4670 are absent from this
// listing; the first line of the signature and the result/operand printing
// are not visible.
4667 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4669 O << " = phi ";
4671}
4672#endif
4673
// NOTE(review): the signature line (listing line 4674) is absent from this
// listing; `State` is presumably a VPTransformState reference parameter.
//
// Creates the "vec.phi" node for a reduction in the vector loop header and
// adds only its start-value incoming edge.
4675 // Reductions do not have to start at zero. They can start with
4676 // any loop invariant values.
4677 VPValue *StartVPV = getStartValue();
4678
4679 // In order to support recurrences we need to be able to vectorize Phi nodes.
4680 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4681 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4682 // this value when we vectorize all of the instructions that use the PHI.
4683 BasicBlock *VectorPH =
4684 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
// The phi is scalar when the VF is scalar or the reduction is in-loop; the
// phi type is simply the type of the start value fetched accordingly.
4685 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4686 Value *StartV = State.get(StartVPV, ScalarPHI);
4687 Type *VecTy = StartV->getType();
4688
4689 BasicBlock *HeaderBB = State.CFG.PrevBB;
4690 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4691 "recipe must be in the vector loop header");
4692 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4693 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4694 State.set(this, Phi, isInLoop());
4695
4696 Phi->addIncoming(StartV, VectorPH);
4697}
4698
4699#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4701 VPSlotTracker &SlotTracker) const {
// Debug-only printer: "WIDEN-REDUCTION-PHI <result> = phi (<kind>)<flags>",
// with a "(VF scaled by 1/N)" suffix when the VF scale factor exceeds 1.
// NOTE(review): listing lines 4700, 4704 and 4709 are absent from this
// listing; the first line of the signature and the result/operand printing
// are not visible.
4702 O << Indent << "WIDEN-REDUCTION-PHI ";
4703
4705 O << " = phi (";
4706 printRecurrenceKind(O, Kind);
4707 O << ")";
4708 printFlags(O);
4710 if (getVFScaleFactor() > 1)
4711 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4712}
4713#endif
4714
// NOTE(review): the signature line (listing line 4715) is absent from this
// listing; `Op` is presumably a VPValue pointer parameter and the owning
// class cannot be determined from here.
// Asserts that Op is one of this recipe's operands, then returns whatever
// vputils::onlyFirstLaneUsed reports for this recipe (Op itself is not used
// in the result).
4716 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4717 return vputils::onlyFirstLaneUsed(this);
4718}
4719
// NOTE(review): the signature line (listing line 4720) is absent from this
// listing; `State` is presumably a VPTransformState reference parameter.
// Creates a phi with the type of operand 0's generated value and records it
// for this recipe; no incoming values are added here.
4721 Value *Op0 = State.get(getOperand(0));
4722 Type *VecTy = Op0->getType();
4723 Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
4724 State.set(this, VecPhi);
4725}
4726
4728 VPCostContext &Ctx) const {
// Cost of a PHI as reported by TTI for the context's cost kind.
// NOTE(review): the first line of the signature (listing line 4727) is absent
// from this listing.
4729 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4730}
4731
4732#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4734 VPSlotTracker &SlotTracker) const {
// Debug-only printer: "WIDEN-PHI <result> = phi <operands>".
// NOTE(review): listing lines 4733, 4737 and 4739 are absent from this
// listing; the first line of the signature and the result/operand printing
// are not visible.
4735 O << Indent << "WIDEN-PHI ";
4736
4738 O << " = phi ";
4740}
4741#endif
4742
// NOTE(review): the signature line (listing line 4743) is absent from this
// listing; `State` is presumably a VPTransformState reference parameter.
// Creates the "active.lane.mask" phi typed like the start mask (operand 0)
// and adds only the incoming value from the block mapped to the first CFG
// predecessor.
4744 BasicBlock *VectorPH =
4745 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4746 Value *StartMask = State.get(getOperand(0));
4747 PHINode *Phi =
4748 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4749 Phi->addIncoming(StartMask, VectorPH);
4750 State.set(this, Phi);
4751}
4752
4753#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4755 VPSlotTracker &SlotTracker) const {
// Debug-only printer: "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
// NOTE(review): listing lines 4754, 4758 and 4760 are absent from this
// listing; the first line of the signature and the result/operand printing
// are not visible.
4756 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4757
4759 O << " = phi ";
4761}
4762#endif
4763
4764#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4766 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// Debug-only printer: "CURRENT-ITERATION-PHI <result> = phi <operands>".
// NOTE(review): listing lines 4765, 4769 and 4771 are absent from this
// listing; the first line of the signature (and so the owning class name) and
// the result/operand printing are not visible.
4767 O << Indent << "CURRENT-ITERATION-PHI ";
4768
4770 O << " = phi ";
4772}
4773#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static void replaceAllUsesWith(Value *Old, Value *New, SmallPtrSet< BasicBlock *, 32 > &FreshBBs, bool IsHuge)
Replace all old uses with new ones, and push the updated BBs into FreshBBs.
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static constexpr Value * getValue(Ty &ValueOrUse)
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, const SCEV *PtrSCEV, VPCostContext &Ctx)
Return true if R is a predicated store with a loop-invariant address only masked by the header mask.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind)
SmallVector< Value *, 2 > VectorParts
static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind)
static unsigned getCalledFnOperandIndex(ArrayRef< VPValue * > Operands)
For call VPInstruction operands, return the operand index of the called function.
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:283
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition Function.h:246
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:867
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2681
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1238
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2674
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2693
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2091
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2276
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2378
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1782
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2508
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1866
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2374
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1176
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2120
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:514
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2386
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1790
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2484
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:68
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static LLVM_ABI bool isSubRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is for a sub operation.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isByteTy() const
True if this is an instance of ByteType.
Definition Type.h:242
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4323
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4376
iterator end()
Definition VPlan.h:4360
const VPRecipeBase & front() const
Definition VPlan.h:4370
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4389
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:2955
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2950
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2946
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:93
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:221
VPlan * getPlan()
Definition VPlan.cpp:211
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:363
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:216
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:559
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:532
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:544
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:554
VPIRValue * getStartValue() const
Definition VPlan.h:4140
VPValue * getStepValue() const
Definition VPlan.h:4142
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPExpandSCEVRecipe(const SCEV *Expr)
bool isVectorToScalar() const
Returns true if this VPExpressionRecipe produces a single scalar.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2448
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2176
Class to record and manage LLVM IR flags.
Definition VPlan.h:700
FastMathFlagsTy FMFs
Definition VPlan.h:788
ReductionFlagsTy ReductionFlags
Definition VPlan.h:790
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:782
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:1005
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
bool isReductionOrdered() const
Definition VPlan.h:1069
TruncFlagsTy TruncFlags
Definition VPlan.h:783
CmpInst::Predicate getPredicate() const
Definition VPlan.h:977
ExactFlagsTy ExactFlags
Definition VPlan.h:785
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:786
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:995
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:1000
DisjointFlagsTy DisjointFlags
Definition VPlan.h:784
FCmpFlagsTy FCmpFlags
Definition VPlan.h:789
NonNegFlagsTy NonNegFlags
Definition VPlan.h:787
bool isReductionInLoop() const
Definition VPlan.h:1075
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:934
uint8_t CmpPredStorage
Definition VPlan.h:781
RecurKind getRecurKind() const
Definition VPlan.h:1063
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1710
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
Type * getResultType() const
Definition VPlan.h:1587
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1231
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1333
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1324
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1337
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1349
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1327
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1274
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1320
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1269
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed-width vectors each containing a ...
Definition VPlan.h:1266
@ VScale
Returns the value for vscale.
Definition VPlan.h:1353
@ CanonicalIVIncrementForPart
Definition VPlan.h:1250
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1277
bool hasResult() const
Definition VPlan.h:1438
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1518
unsigned getOpcode() const
Definition VPlan.h:1422
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1463
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:3059
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:3063
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3061
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3053
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3082
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3047
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3156
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3169
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3119
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
virtual unsigned getNumIncoming() const
Returns the number of incoming values, also number of incoming blocks.
Definition VPlan.h:1623
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4467
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1648
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1608
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:401
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4668
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:117
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:476
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:554
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
bool isSafeToSpeculativelyExecute() const
Return true if we can safely execute this recipe unconditionally even if it is masked originally.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:522
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:466
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
friend class VPValue
Definition VPlanValue.h:316
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3328
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2856
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2880
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3270
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3281
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3283
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3266
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3272
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3279
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3274
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4533
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4609
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3350
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3406
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
static Type * computeScalarType(const Instruction *I, ArrayRef< VPValue * > Operands)
Compute the scalar result type for a VPReplicateRecipe wrapping I with Operands (excluding any predic...
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
unsigned getOpcode() const
Definition VPlan.h:3430
VPValue * getStepValue() const
Definition VPlan.h:4212
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4220
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:614
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:685
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:616
This class can be used to assign names to VPValues.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1527
operand_range operands()
Definition VPlanValue.h:455
unsigned getNumOperands() const
Definition VPlanValue.h:422
operand_iterator op_end()
Definition VPlanValue.h:453
operand_iterator op_begin()
Definition VPlanValue.h:451
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:423
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1478
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1523
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPValue * getVFValue() const
Definition VPlan.h:2278
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2275
int64_t getStride() const
Definition VPlan.h:2276
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
VPValue * getStride() const
Definition VPlan.h:2344
Type * getSourceElementType() const
Definition VPlan.h:2352
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
VPValue * getVFxPart() const
Definition VPlan.h:2346
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
operand_range args()
Definition VPlan.h:2127
Function * getCalledScalarFunction() const
Definition VPlan.h:2123
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2232
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2517
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2520
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2618
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2633
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
CallInst * createVectorCall(VPTransformState &State)
Helper function to produce the widened intrinsic call.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:2012
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
void execute(VPTransformState &State) override
Produce a widened version of the vector memory intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector memory intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3681
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3704
Instruction & Ingredient
Definition VPlan.h:3672
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3678
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3714
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3675
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3707
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4681
const DataLayout & getDataLayout() const
Definition VPlan.h:4886
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4988
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:806
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignores it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
ElementCount getVectorizedTypeVF(Type *Ty)
Returns the number of vector elements for a vectorized type.
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
LLVM_ABI Type * computeScalarTypeForInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands)
Compute the scalar result type for an IR Opcode given Operands.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ None
Not a recurrence.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
LLVM_ABI Type * getScalarTypeOrInfer(VPValue *V)
Return the scalar type of V.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1948
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1768
PHINode & getIRPhi()
Definition VPlan.h:1781
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1123
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:313
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3798
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3899
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3902
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3848