LLVM 23.0.0git
VPlanRecipes.cpp
Go to the documentation of this file.
1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
15#include "VPlan.h"
16#include "VPlanHelpers.h"
17#include "VPlanPatternMatch.h"
18#include "VPlanUtils.h"
19#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/Twine.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
36#include "llvm/Support/Debug.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
50#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
51// It is sometimes necessary to disable printing of metadata in tests in order
52// to avoid non-deterministic behaviour due to metadata introduced by VPlan
53// that wasn't present in the original scalar IR.
55 "vplan-print-metadata", cl::init(true), cl::Hidden,
56 cl::desc("Controls the printing of recipe metadata when debugging."));
57#endif
58
// NOTE(review): the function header is not visible in this listing. Judging by
// the assertion message below, this is presumably
// VPRecipeBase::mayWriteToMemory() -- confirm against the full file.
//
// Dispatches on the recipe ID and reports whether the recipe may write to
// memory. Recipe kinds that are not listed reach `default` and are reported
// as writing (returns true).
60 switch (getVPRecipeID()) {
61 case VPExpressionSC:
62 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
63 case VPInstructionSC: {
64 auto *VPI = cast<VPInstruction>(this);
65 // Loads read from memory but don't write to memory.
66 if (VPI->getOpcode() == Instruction::Load)
67 return false;
68 return VPI->opcodeMayReadOrWriteFromMemory();
69 }
// Interleave recipes are reported as writing only when they have at least one
// store operand.
70 case VPInterleaveEVLSC:
71 case VPInterleaveSC:
72 return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
73 case VPWidenStoreEVLSC:
74 case VPWidenStoreSC:
75 return true;
// Replicate recipes defer to the underlying IR instruction.
76 case VPReplicateSC:
77 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
78 ->mayWriteToMemory();
// A widened call is reported as writing unless the called scalar function
// only reads memory.
79 case VPWidenCallSC:
80 return !cast<VPWidenCallRecipe>(this)
81 ->getCalledScalarFunction()
82 ->onlyReadsMemory();
83 case VPWidenMemIntrinsicSC:
84 case VPWidenIntrinsicSC:
85 return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
86 case VPActiveLaneMaskPHISC:
87 case VPCurrentIterationPHISC:
88 case VPBranchOnMaskSC:
89 case VPDerivedIVSC:
90 case VPFirstOrderRecurrencePHISC:
91 case VPReductionPHISC:
92 case VPScalarIVStepsSC:
93 case VPPredInstPHISC:
94 return false;
// The following kinds always return false. In builds with assertions enabled,
// an underlying IR instruction (if there is one) is checked to agree.
95 case VPBlendSC:
96 case VPReductionEVLSC:
97 case VPReductionSC:
98 case VPVectorPointerSC:
99 case VPWidenCanonicalIVSC:
100 case VPWidenCastSC:
101 case VPWidenGEPSC:
102 case VPWidenIntOrFpInductionSC:
103 case VPWidenLoadEVLSC:
104 case VPWidenLoadSC:
105 case VPWidenPHISC:
106 case VPWidenPointerInductionSC:
107 case VPWidenSC: {
108 const Instruction *I =
109 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
// `I` is only used inside the assert; the void cast avoids an unused-variable
// warning when asserts are compiled out.
110 (void)I;
111 assert((!I || !I->mayWriteToMemory()) &&
112 "underlying instruction may write to memory");
113 return false;
114 }
115 default:
116 return true;
117 }
118}
119
// NOTE(review): the function header is not visible in this listing. Judging by
// the assertion message below, this is presumably
// VPRecipeBase::mayReadFromMemory() -- confirm against the full file.
//
// Dispatches on the recipe ID and reports whether the recipe may read from
// memory. Recipe kinds that are not listed reach `default` and are reported
// as reading (returns true).
121 switch (getVPRecipeID()) {
122 case VPExpressionSC:
123 return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
124 case VPInstructionSC:
125 return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
126 case VPWidenLoadEVLSC:
127 case VPWidenLoadSC:
128 return true;
// Replicate recipes defer to the underlying IR instruction.
129 case VPReplicateSC:
130 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
131 ->mayReadFromMemory();
// A widened call is reported as reading unless the called scalar function
// only writes memory.
132 case VPWidenCallSC:
133 return !cast<VPWidenCallRecipe>(this)
134 ->getCalledScalarFunction()
135 ->onlyWritesMemory();
136 case VPWidenMemIntrinsicSC:
137 case VPWidenIntrinsicSC:
138 return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
139 case VPBranchOnMaskSC:
140 case VPDerivedIVSC:
141 case VPCurrentIterationPHISC:
142 case VPFirstOrderRecurrencePHISC:
143 case VPReductionPHISC:
144 case VPPredInstPHISC:
145 case VPScalarIVStepsSC:
146 case VPWidenStoreEVLSC:
147 case VPWidenStoreSC:
148 return false;
// The following kinds always return false. In builds with assertions enabled,
// an underlying IR instruction (if there is one) is checked to agree.
149 case VPBlendSC:
150 case VPReductionEVLSC:
151 case VPReductionSC:
152 case VPVectorPointerSC:
153 case VPWidenCanonicalIVSC:
154 case VPWidenCastSC:
155 case VPWidenGEPSC:
156 case VPWidenIntOrFpInductionSC:
157 case VPWidenPHISC:
158 case VPWidenPointerInductionSC:
159 case VPWidenSC: {
160 const Instruction *I =
161 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
// `I` is only used inside the assert; the void cast avoids an unused-variable
// warning when asserts are compiled out.
162 (void)I;
163 assert((!I || !I->mayReadFromMemory()) &&
164 "underlying instruction may read from memory");
165 return false;
166 }
167 default:
168 // FIXME: Return false if the recipe represents an interleaved store.
169 return true;
170 }
171}
172
// NOTE(review): the function header is not visible in this listing. Judging by
// the assertion messages below, this is presumably
// VPRecipeBase::mayHaveSideEffects() -- confirm against the full file.
//
// Dispatches on the recipe ID and reports whether the recipe may have side
// effects. Recipe kinds that are not listed reach `default` and are reported
// as having side effects (returns true).
174 switch (getVPRecipeID()) {
175 case VPExpressionSC:
176 return cast<VPExpressionRecipe>(this)->mayHaveSideEffects();
177 case VPActiveLaneMaskPHISC:
178 case VPDerivedIVSC:
179 case VPCurrentIterationPHISC:
180 case VPFirstOrderRecurrencePHISC:
181 case VPReductionPHISC:
182 case VPPredInstPHISC:
183 case VPVectorEndPointerSC:
184 return false;
// A VPInstruction has side effects if it may write to memory or if its opcode
// is one of the three branch opcodes listed below.
185 case VPInstructionSC: {
186 auto *VPI = cast<VPInstruction>(this);
187 return mayWriteToMemory() ||
188 VPI->getOpcode() == VPInstruction::BranchOnCount ||
189 VPI->getOpcode() == VPInstruction::BranchOnCond ||
190 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
191 }
// A widened call has side effects if it may write to memory, or if the called
// scalar function may throw or may not return.
192 case VPWidenCallSC: {
193 Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
194 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
195 }
196 case VPWidenMemIntrinsicSC:
197 case VPWidenIntrinsicSC:
198 return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
// The following kinds always return false. In builds with assertions enabled,
// an underlying IR instruction (if there is one) is checked to agree.
199 case VPBlendSC:
200 case VPReductionEVLSC:
201 case VPReductionSC:
202 case VPScalarIVStepsSC:
203 case VPVectorPointerSC:
204 case VPWidenCanonicalIVSC:
205 case VPWidenCastSC:
206 case VPWidenGEPSC:
207 case VPWidenIntOrFpInductionSC:
208 case VPWidenPHISC:
209 case VPWidenPointerInductionSC:
210 case VPWidenSC: {
211 const Instruction *I =
212 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
213 (void)I;
214 assert((!I || !I->mayHaveSideEffects()) &&
215 "underlying instruction has side-effects");
216 return false;
217 }
218 case VPInterleaveEVLSC:
219 case VPInterleaveSC:
220 return mayWriteToMemory();
// Widened loads and stores report side effects exactly when they may write to
// memory.
// NOTE(review): the right-hand side of the `==` in the assertion below is not
// visible in this listing (embedded line 227 is absent).
221 case VPWidenLoadEVLSC:
222 case VPWidenLoadSC:
223 case VPWidenStoreEVLSC:
224 case VPWidenStoreSC:
225 assert(
226 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
228 "mayHaveSideffects result for ingredient differs from this "
229 "implementation");
230 return mayWriteToMemory();
// Replicate recipes defer to the underlying IR instruction.
231 case VPReplicateSC: {
232 auto *R = cast<VPReplicateRecipe>(this);
233 return R->getUnderlyingInstr()->mayHaveSideEffects();
234 }
235 default:
236 return true;
237 }
238}
239
// NOTE(review): the function header (name and return type) is not visible in
// this listing, so the exact property being queried cannot be stated here.
//
// Returns true only for VPInstruction recipes whose opcode is a cast, Add,
// Sub, Mul or GetElementPtr; every other recipe kind and every other opcode
// yields false.
241 switch (getVPRecipeID()) {
242 default:
243 return false;
244 case VPInstructionSC: {
245 unsigned Opcode = cast<VPInstruction>(this)->getOpcode();
// All cast opcodes qualify.
246 if (Instruction::isCast(Opcode))
247 return true;
248
249 switch (Opcode) {
250 default:
251 return false;
252 case Instruction::Add:
253 case Instruction::Sub:
254 case Instruction::Mul:
255 case Instruction::GetElementPtr:
256 return true;
257 }
258 }
259 }
260}
261
// NOTE(review): the function header is not visible in this listing; the body
// reads like VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) -- confirm.
//
// Inserts this recipe into InsertPos's parent block at InsertPos's iterator
// position. Asserts that this recipe has no parent yet and that InsertPos
// already belongs to a block.
263 assert(!Parent && "Recipe already in some VPBasicBlock");
264 assert(InsertPos->getParent() &&
265 "Insertion position not in any VPBasicBlock");
266 InsertPos->getParent()->insert(this, InsertPos->getIterator());
267}
268
// Inserts this recipe into block BB at position I. Asserts that this recipe
// has no parent yet and that I is either BB.end() or refers to a recipe
// inside BB.
// NOTE(review): the second line of the parameter list (declaring `I`) is not
// visible in this listing.
269void VPRecipeBase::insertBefore(VPBasicBlock &BB,
271 assert(!Parent && "Recipe already in some VPBasicBlock");
272 assert(I == BB.end() || I->getParent() == &BB);
273 BB.insert(this, I);
274}
275
// NOTE(review): the function header is not visible in this listing; the body
// reads like VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) -- confirm.
//
// Inserts this recipe into InsertPos's parent block at the position just
// after InsertPos (std::next of its iterator). Asserts that this recipe has
// no parent yet and that InsertPos already belongs to a block.
277 assert(!Parent && "Recipe already in some VPBasicBlock");
278 assert(InsertPos->getParent() &&
279 "Insertion position not in any VPBasicBlock");
280 InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
281}
282
// NOTE(review): the function header and one statement (embedded line 285) are
// not visible in this listing; presumably VPRecipeBase::removeFromParent() --
// confirm against the full file.
//
// Asserts that the recipe currently has a parent block and, in the visible
// part, clears the Parent pointer.
284 assert(getParent() && "Recipe not in any VPBasicBlock");
286 Parent = nullptr;
287}
288
// NOTE(review): the function header and the statement after the assert
// (embedded line 291) are not visible in this listing; presumably
// VPRecipeBase::eraseFromParent() -- confirm against the full file.
//
// The visible part only asserts that the recipe currently has a parent block.
290 assert(getParent() && "Recipe not in any VPBasicBlock");
292}
293
// NOTE(review): the function header and the preceding statement (embedded
// lines 294-295) are not visible in this listing; presumably
// VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) -- confirm.
//
// The visible part re-inserts this recipe right after InsertPos.
296 insertAfter(InsertPos);
297}
298
304
// NOTE(review): the function header is not visible in this listing; the body
// uses `VF` and `Ctx`, so this is presumably
// VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) -- confirm.
//
// Computes the cost of this recipe for the given VF and returns it. The cost
// is 0 when the cost context asks to skip the underlying instruction;
// otherwise it comes from computeCost(), possibly overridden when the
// ForceTargetInstructionCost option was given on the command line.
306 // Get the underlying instruction for the recipe, if there is one. It is used
307 // to
308 // * decide if cost computation should be skipped for this recipe,
309 // * apply forced target instruction cost.
310 Instruction *UI = nullptr;
311 if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
312 UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
313 else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
314 UI = IG->getInsertPos();
315 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
316 UI = &WidenMem->getIngredient();
317
318 InstructionCost RecipeCost;
319 if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
320 RecipeCost = 0;
321 } else {
322 RecipeCost = computeCost(VF, Ctx);
// The override is applied only when the option was explicitly specified and
// the computed cost is valid.
// NOTE(review): the statement executed when `UI` is non-null (embedded line
// 326) is not visible in this listing; without an underlying instruction the
// cost is forced to 0.
323 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
324 RecipeCost.isValid()) {
325 if (UI)
327 else
328 RecipeCost = InstructionCost(0);
329 }
330 }
331
332 LLVM_DEBUG({
333 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
334 dump();
335 });
336 return RecipeCost;
337}
338
// NOTE(review): the first line of the function header is not visible in this
// listing; the message below suggests this is the base-class computeCost().
//
// Not meant to be reached: hitting this implementation is reported via
// llvm_unreachable, since subclasses are expected to provide the cost.
340 VPCostContext &Ctx) const {
341 llvm_unreachable("subclasses should implement computeCost");
342}
343
// NOTE(review): the function header and the second operand of the `||`
// (embedded line 346) are not visible in this listing; presumably
// VPRecipeBase::isPhi() -- confirm against the full file.
//
// The visible part is true when the recipe ID lies in the inclusive range
// [VPFirstPHISC, VPLastPHISC].
345 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
347}
348
// NOTE(review): the function header is not visible in this listing; the body
// reads like VPIRFlags::intersectFlags(const VPIRFlags &Other) -- confirm.
//
// Narrows the flags stored on this object to those that are also set on
// Other, using `&=` per flag. Both objects must have the same operation type
// (asserted). Non-droppable properties (compare predicates, reduction kind,
// ordering and in-loop state) are only asserted to be equal, never modified.
350 assert(OpType == Other.OpType && "OpType must match");
351 switch (OpType) {
352 case OperationType::OverflowingBinOp:
353 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
354 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
355 break;
356 case OperationType::Trunc:
357 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
358 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
359 break;
360 case OperationType::DisjointOp:
361 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
362 break;
363 case OperationType::PossiblyExactOp:
364 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
365 break;
366 case OperationType::GEPOp:
367 GEPFlagsStorage &= Other.GEPFlagsStorage;
368 break;
// FPMathOp and FCmp share one path; for FCmp the predicate must additionally
// match.
// NOTE(review): only NoNaNs and NoInfs are intersected here; the remaining
// fast-math flags are left untouched -- confirm that this is intended.
369 case OperationType::FPMathOp:
370 case OperationType::FCmp:
371 assert((OpType != OperationType::FCmp ||
372 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
373 "Cannot drop CmpPredicate");
374 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
375 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
376 break;
377 case OperationType::NonNegOp:
378 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
379 break;
// Integer compares carry no droppable flags; only the predicate is checked.
380 case OperationType::Cmp:
381 assert(CmpPredStorage == Other.CmpPredStorage &&
382 "Cannot drop CmpPredicate");
383 break;
384 case OperationType::ReductionOp:
385 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
386 "Cannot change RecurKind");
387 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
388 "Cannot change IsOrdered");
389 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
390 "Cannot change IsInLoop");
391 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
392 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
393 break;
394 case OperationType::Other:
395 break;
396 }
397}
398
// NOTE(review): the function header is not visible in this listing; the body
// builds and returns a FastMathFlags value, so this is presumably
// VPIRFlags::getFastMathFlags() -- confirm against the full file.
//
// Returns a default-constructed result when hasFastMathFlags() is false.
// Otherwise copies each of the seven stored fast-math bits into a
// FastMathFlags object through its setters and returns it.
400 if (!hasFastMathFlags())
401 return {};
402 const FastMathFlagsTy &F = getFMFsRef();
403 FastMathFlags Res;
404 Res.setAllowReassoc(F.AllowReassoc);
405 Res.setNoNaNs(F.NoNaNs);
406 Res.setNoInfs(F.NoInfs);
407 Res.setNoSignedZeros(F.NoSignedZeros);
408 Res.setAllowReciprocal(F.AllowReciprocal);
409 Res.setAllowContract(F.AllowContract);
410 Res.setApproxFunc(F.ApproxFunc);
411 return Res;
412}
413
414#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
416
// Prints this recipe to O: first the recipe itself via printRecipe(), then
// ", !dbg " followed by the debug location when one is set.
// NOTE(review): the statement guarded by the final `if` (embedded line 426) is
// not visible in this listing; it presumably prints the recipe's metadata.
417void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
418 VPSlotTracker &SlotTracker) const {
419 printRecipe(O, Indent, SlotTracker);
420 if (auto DL = getDebugLoc()) {
421 O << ", !dbg ";
422 DL.print(O);
423 }
424
425 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
427}
428#endif
429
// NOTE(review): the constructor header is not visible in this listing; the
// recipe ID used below suggests this is the VPExpandSCEVRecipe constructor.
//
// Initializes the VPSingleDefRecipe base with the VPExpandSCEVSC recipe ID,
// an empty operand list and the type of Expr, then stores Expr.
431 : VPSingleDefRecipe(VPRecipeBase::VPExpandSCEVSC, {}, Expr->getType()),
432 Expr(Expr) {}
433
434/// For call VPInstruction operands, return the operand index of the called
435/// function. The function is either the last operand (for unmasked calls) or
436/// the second-to-last operand (for masked calls).
// NOTE(review): the function header is not visible in this listing; a later
// call site names it getCalledFnOperandIndex(Operands).
//
// Returns NumOps - 1 when the last operand is a VPIRValue wrapping a Function,
// and NumOps - 2 otherwise.
// NOTE(review): the opening of the assertion that guards the second case
// (embedded line 442) is not visible; only its message survives.
438 unsigned NumOps = Operands.size();
439 auto *LastOp = dyn_cast<VPIRValue>(Operands[NumOps - 1]);
440 if (LastOp && isa<Function>(LastOp->getValue()))
441 return NumOps - 1;
443 "expected function operand");
444 return NumOps - 2;
445}
446
447/// For call VPInstruction operands, return the called function.
// NOTE(review): the function header is not visible in this listing; a later
// call site names it getCalledFunction(Operands).
//
// Looks up the index of the function operand and returns that operand's
// wrapped IR value as a Function. Both casts are unchecked `cast<>`s, so the
// operand is required to be a VPIRValue holding a Function.
449 unsigned Idx = getCalledFnOperandIndex(Operands);
450 return cast<Function>(cast<VPIRValue>(Operands[Idx])->getValue());
451}
452
// NOTE(review): the first line of the function header is not visible in this
// listing; later call sites name it
// computeScalarTypeForInstruction(Opcode, Operands).
//
// Infers the scalar result type for an opcode from its operands, asserting
// along the way that the operand types are consistent with the opcode.
// Requires at least one operand (asserted). Opcodes not handled by the switch
// fall through to the end and return the scalar type of operand 0.
//
// NOTE(review): many `case` labels inside the switch are not visible in this
// listing (gaps in the embedded line numbers); branches that start without a
// label belong to such hidden cases.
454 ArrayRef<VPValue *> Operands) {
455 assert(!Operands.empty() &&
456 "zero-operand VPInstruction opcodes must pass explicit ResultTy");
457 // Assert operand \p Idx (if present and typed) has type \p ExpectedTy.
458 [[maybe_unused]] auto AssertOperandType = [&Operands](unsigned Idx,
459 Type *ExpectedTy) {
460 if (!ExpectedTy || Operands.size() <= Idx)
461 return;
462 [[maybe_unused]] Type *OpTy = Operands[Idx]->getScalarType();
463 assert((!OpTy || OpTy == ExpectedTy) &&
464 "different types inferred for different operands");
465 };
466
467 Type *Op0Ty = Operands[0]->getScalarType();
468 LLVMContext &Ctx = Op0Ty->getContext();
469 switch (Opcode) {
471 assert(Op0Ty->isIntegerTy(1) && "expected bool condition");
472 return Type::getVoidTy(Ctx);
474 assert(Op0Ty->isIntegerTy(1) && "expected bool condition");
475 AssertOperandType(1, IntegerType::get(Ctx, 1));
476 return Type::getVoidTy(Ctx);
478 assert(Op0Ty->isIntegerTy() && "expected integer operand");
479 AssertOperandType(1, Op0Ty);
480 return Type::getVoidTy(Ctx);
483 assert(Op0Ty->isIntegerTy() && "expected integer operand");
484 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
485 AssertOperandType(Idx, Op0Ty);
486 return Op0Ty;
487 case Instruction::Switch:
488 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
489 AssertOperandType(Idx, Op0Ty);
490 return Type::getVoidTy(Ctx);
491 case Instruction::Store:
492 return Type::getVoidTy(Ctx);
// Compares produce an i1 result; both operands must share a type.
493 case Instruction::ICmp:
494 assert(Op0Ty->isIntOrPtrTy() && "expected integer or pointer operand");
495 AssertOperandType(1, Op0Ty);
496 return IntegerType::get(Ctx, 1);
497 case Instruction::FCmp:
498 assert(Op0Ty->isFloatingPointTy() && "expected floating-point operand");
499 AssertOperandType(1, Op0Ty);
500 return IntegerType::get(Ctx, 1);
502 assert(Op0Ty->isIntegerTy() && "expected integer operand");
503 AssertOperandType(1, Op0Ty);
504 return IntegerType::get(Ctx, 1);
506 assert(Op0Ty->isIntegerTy(1) && "expected bool operand");
507 return IntegerType::get(Ctx, 1);
510 assert(Op0Ty->isIntegerTy(1) && "expected bool operand");
511 AssertOperandType(1, Op0Ty);
512 return IntegerType::get(Ctx, 1);
514 assert(Op0Ty->isIntegerTy(1) && "expected bool operand");
515 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
516 AssertOperandType(Idx, Op0Ty);
517 return IntegerType::get(Ctx, 1);
519 assert(Op0Ty->isIntegerTy() && "expected integer operand");
520 return IntegerType::get(Ctx, 32);
// The result of a select has the type of its value operands (1 and 2), not of
// the condition.
521 case Instruction::Select: {
522 assert((!Op0Ty || Op0Ty->isIntegerTy(1)) &&
523 "select condition must be bool");
524 Type *Op1Ty = Operands[1]->getScalarType();
525 AssertOperandType(2, Op1Ty);
526 return Op1Ty;
527 }
528 case Instruction::InsertElement:
529 // The inserted scalar (operand 1) must match the vector element type;
530 // operand 2 must be an integer.
531 AssertOperandType(1, Op0Ty);
532 assert(Operands[2]->getScalarType()->isIntegerTy() &&
533 "expected integer operand");
534 return Op0Ty;
536 // The start value and the identity value (operands 0 and 1) fill the same
537 // vector and must match in type; operand 2 is the scaling factor.
538 AssertOperandType(1, Op0Ty);
539 return Op0Ty;
541 assert(Operands.size() >= 2 && "ExtractLane requires a lane operand and "
542 "at least one source vector operand");
543 // Operand 0 is the lane index, used for integer arithmetic.
544 assert(Op0Ty->isIntegerTy() && "expected integer operand");
545 Type *Op1Ty = Operands[1]->getScalarType();
546 for (unsigned Idx = 2; Idx != Operands.size(); ++Idx)
547 AssertOperandType(Idx, Op1Ty);
548 return Op1Ty;
549 }
552 assert(Operands[0]->getScalarType()->isPointerTy() &&
553 "expected pointer operand");
554 assert(Operands[1]->getScalarType()->isIntegerTy() &&
555 "expected integer operand");
556 return Op0Ty;
// Only a single index level is supported; the result is the struct field type
// selected by the constant index in operand 1.
557 case Instruction::ExtractValue: {
558 assert(Operands.size() == 2 && "expected single level extractvalue");
559 auto *StructTy = cast<StructType>(Op0Ty);
560 return StructTy->getTypeAtIndex(
561 cast<VPConstantInt>(Operands[1])->getZExtValue());
562 }
// For these opcodes the result type cannot be inferred from the operands, so
// callers must supply it explicitly.
567 case Instruction::Load:
568 case Instruction::Alloca:
569 llvm_unreachable("type must be passed explicitly");
570 case Instruction::Call:
571 return getCalledFunction(Operands)->getReturnType();
572 default:
573 break;
574 }
575
576 // Opcodes that require all operands to share the same scalar type as the
577 // result.
// NOTE(review): part of the initializer below (embedded lines 580-582) is not
// visible in this listing.
578 bool AllOperandsSameType =
579 Instruction::isBinaryOp(Opcode) ||
583 Opcode);
584 if (AllOperandsSameType)
585 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
586 AssertOperandType(Idx, Op0Ty);
587
588 return Op0Ty;
589}
590
// NOTE(review): the first line of the function header is not visible in this
// listing; the body uses an instruction pointer `I` and `Operands`.
//
// Returns I's own IR type for casts, ExtractValue, Load and Alloca. For every
// other opcode the type is inferred from the opcode and operands via
// computeScalarTypeForInstruction().
592 ArrayRef<VPValue *> Operands) {
593 unsigned Opcode = I->getOpcode();
594 if (Instruction::isCast(Opcode) ||
595 is_contained(ArrayRef<unsigned>({Instruction::ExtractValue,
596 Instruction::Load, Instruction::Alloca}),
597 Opcode))
598 return I->getType();
599 return computeScalarTypeForInstruction(Opcode, Operands);
600}
601
// NOTE(review): the first line of the constructor header, the name of the
// base class being initialized and the openings of the three assertions in
// the body are not visible in this listing (gaps in the embedded line
// numbers). The VPInstructionSC recipe ID suggests this is the VPInstruction
// constructor -- confirm against the full file.
//
// The base is given the VPInstructionSC recipe ID, the operands, the flags
// and the debug location. When ResultTy is null the scalar result type is
// inferred from the opcode and operands. Metadata, opcode and name are stored
// afterwards; the surviving assertion messages refer to flag support,
// required flags and the operand count.
603 const VPIRFlags &Flags, const VPIRMetadata &MD,
604 DebugLoc DL, const Twine &Name, Type *ResultTy)
606 VPRecipeBase::VPInstructionSC, Operands,
607 ResultTy ? ResultTy
608 : computeScalarTypeForInstruction(Opcode, Operands),
609 Flags, DL),
610 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
612 "Set flags not supported for the provided opcode");
614 "Opcode requires specific flags to be set");
618 "number of operands does not match opcode");
619}
620
// NOTE(review): the function header is not visible in this listing; the body
// maps an opcode to an operand count, presumably
// VPInstruction::getNumOperandsForOpcode() -- confirm against the full file.
//
// Returns the number of operands expected for Opcode: 1 for unary ops and
// casts, 2 for binary ops, and a per-opcode value from the switch otherwise.
// Returns -1u when the count cannot be derived from the opcode alone.
//
// NOTE(review): a number of `case` labels in the switch, and most of the
// return expression for Instruction::Call, are not visible in this listing
// (gaps in the embedded line numbers).
622 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
623 return 1;
624
625 if (Instruction::isBinaryOp(Opcode))
626 return 2;
627
628 switch (Opcode) {
632 return 0;
633 case Instruction::Alloca:
634 case Instruction::ExtractValue:
635 case Instruction::Freeze:
636 case Instruction::Load:
649 return 1;
650 case Instruction::ICmp:
651 case Instruction::FCmp:
652 case Instruction::ExtractElement:
653 case Instruction::Store:
664 return 2;
665 case Instruction::InsertElement:
666 case Instruction::Select:
669 return 3;
670 case Instruction::Call:
672 1;
673 case Instruction::GetElementPtr:
674 case Instruction::PHI:
675 case Instruction::Switch:
685 // Cannot determine the number of operands from the opcode.
686 return -1u;
687 }
688 llvm_unreachable("all cases should be handled above");
689}
690
694
// Reports whether this instruction's opcode is one for which a scalar value
// for just the first lane can be generated. Opcodes not accepted by the early
// checks or the switch yield false.
// NOTE(review): the conditions guarding the two early `return true`
// statements, and several `case` labels in the switch, are not visible in
// this listing (gaps in the embedded line numbers).
695bool VPInstruction::canGenerateScalarForFirstLane() const {
697 return true;
699 return true;
700 switch (Opcode) {
701 case Instruction::Freeze:
702 case Instruction::ICmp:
703 case Instruction::PHI:
704 case Instruction::Select:
714 return true;
715 default:
716 return false;
717 }
718}
719
// NOTE(review): the function header is not visible in this listing, so the
// function's name and return type cannot be stated here.
//
// Maps a subtracting recurrence kind to the matching add opcode:
// RecurKind::Sub gives Instruction::Add and RecurKind::FSub gives
// Instruction::FAdd. Any other kind is a programming error and hits
// llvm_unreachable.
721 if (Kind == RecurKind::Sub)
722 return Instruction::Add;
723 if (Kind == RecurKind::FSub)
724 return Instruction::FAdd;
725 llvm_unreachable("RecurKind should be Sub/FSub.");
726}
727
// Emits IR for this VPInstruction through State.Builder and returns the
// generated value. Operands are fetched from State, either as full values or
// as a single scalar/lane depending on the opcode. An opcode with no matching
// branch reaches llvm_unreachable.
//
// NOTE(review): many `case` labels and a few statements are not visible in
// this listing (gaps in the embedded line numbers); a branch that begins
// without a label belongs to such a hidden case. This includes the condition
// that opens the first block below, which emits a binary operator.
728Value *VPInstruction::generate(VPTransformState &State) {
729 IRBuilderBase &Builder = State.Builder;
730
732 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
733 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
734 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
735 auto *Res =
736 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
// Flags are applied only when the builder returned an Instruction.
737 if (auto *I = dyn_cast<Instruction>(Res))
738 applyFlags(*I);
739 return Res;
740 }
741
742 switch (getOpcode()) {
743 case VPInstruction::Not: {
744 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
745 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
746 return Builder.CreateNot(A, Name);
747 }
// With a constant index the requested lane is fetched directly from State;
// otherwise an extractelement is emitted.
748 case Instruction::ExtractElement: {
749 assert(State.VF.isVector() && "Only extract elements from vectors");
750 if (auto *Idx = dyn_cast<VPConstantInt>(getOperand(1)))
751 return State.get(getOperand(0), VPLane(Idx->getZExtValue()));
752 Value *Vec = State.get(getOperand(0));
753 Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
754 return Builder.CreateExtractElement(Vec, Idx, Name);
755 }
756 case Instruction::InsertElement: {
757 assert(State.VF.isVector() && "Can only insert elements into vectors");
758 Value *Vec = State.get(getOperand(0), /*IsScalar=*/false);
759 Value *Elt = State.get(getOperand(1), /*IsScalar=*/true);
760 Value *Idx = State.get(getOperand(2), /*IsScalar=*/true);
761 return Builder.CreateInsertElement(Vec, Elt, Idx, Name);
762 }
// NOTE(review): the definition of `Op` (embedded line 764) is not visible.
763 case Instruction::Freeze: {
765 return Builder.CreateFreeze(Op, Name);
766 }
767 case Instruction::FCmp:
768 case Instruction::ICmp: {
769 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
770 Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
771 Value *B = State.get(getOperand(1), OnlyFirstLaneUsed);
772 return Builder.CreateCmp(getPredicate(), A, B, Name);
773 }
774 case Instruction::PHI: {
775 llvm_unreachable("should be handled by VPPhi::execute");
776 }
// The condition is fetched as a scalar when only the first lane is used or
// when the condition operand is a single scalar.
777 case Instruction::Select: {
778 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
779 Value *Cond =
780 State.get(getOperand(0),
781 OnlyFirstLaneUsed || vputils::isSingleScalar(getOperand(0)));
782 Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
783 Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
784 return Builder.CreateSelectFMF(Cond, Op1, Op2, getFastMathFlagsOrNone(),
785 Name);
786 }
// Hidden case label; the branch emits an active-lane-mask computation.
788 // Get first lane of vector induction variable.
789 Value *VIVElem0 = State.get(getOperand(0), VPLane(0));
790 // Get the original loop tripcount.
791 Value *ScalarTC = State.get(getOperand(1), VPLane(0));
792
793 // If this part of the active lane mask is scalar, generate the CMP directly
794 // to avoid unnecessary extracts.
795 if (State.VF.isScalar())
796 return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
797 Name);
798
// The mask element count is VF multiplied by the constant in operand 2.
799 ElementCount EC = State.VF.multiplyCoefficientBy(
800 cast<VPConstantInt>(getOperand(2))->getZExtValue());
801 auto *PredTy = VectorType::get(Builder.getInt1Ty(), EC);
802 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
803 {PredTy, ScalarTC->getType()},
804 {VIVElem0, ScalarTC}, nullptr, Name);
805 }
// Hidden case label; the branch counts set lanes of an i1 vector by
// zero-extending it to the result type and add-reducing.
807 Value *Op = State.get(getOperand(0));
808 auto *VecTy = cast<VectorType>(Op->getType());
809 assert(VecTy->getScalarSizeInBits() == 1 &&
810 "NumActiveLanes only implemented for i1 vectors");
811
812 Type *Ty = getScalarType();
813 Value *ZExt = Builder.CreateCast(
814 Instruction::ZExt, Op, VectorType::get(Ty, VecTy->getElementCount()));
815 Value *NumActive =
816 Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
817 return NumActive;
818 }
820 // Generate code to combine the previous and current values in vector v3.
821 //
822 // vector.ph:
823 // v_init = vector(..., ..., ..., a[-1])
824 // br vector.body
825 //
826 // vector.body
827 // i = phi [0, vector.ph], [i+4, vector.body]
828 // v1 = phi [v_init, vector.ph], [v2, vector.body]
829 // v2 = a[i, i+1, i+2, i+3];
830 // v3 = vector(v1(3), v2(0, 1, 2))
831
832 auto *V1 = State.get(getOperand(0));
// A non-vector first operand is returned unchanged; no splice is emitted.
833 if (!V1->getType()->isVectorTy())
834 return V1;
835 Value *V2 = State.get(getOperand(1));
836 return Builder.CreateVectorSpliceRight(V1, V2, 1, Name);
837 }
// Hidden case label; the branch emits (ScalarTC > VFxUF) ? ScalarTC - VFxUF
// : Zero.
// NOTE(review): the definition of `Zero` (embedded line 844) is not visible.
839 Value *ScalarTC = State.get(getOperand(0), VPLane(0));
840 Value *VFxUF = State.get(getOperand(1), VPLane(0));
841 Value *Sub = Builder.CreateSub(ScalarTC, VFxUF);
842 Value *Cmp =
843 Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, VFxUF);
845 return Builder.CreateSelect(Cmp, Sub, Zero);
846 }
848 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
849 // be outside of the main loop.
850 Value *AVL = State.get(getOperand(0), /*IsScalar*/ true);
851 // Compute EVL
852 assert(AVL->getType()->isIntegerTy() &&
853 "Requested vector length should be an integer.");
854
855 assert(State.VF.isScalable() && "Expected scalable vector factor.");
856 Value *VFArg = Builder.getInt32(State.VF.getKnownMinValue());
857
858 Value *EVL = Builder.CreateIntrinsic(
859 Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
860 {AVL, VFArg, Builder.getTrue()});
861 return EVL;
862 }
// Hidden case label; the branch emits a conditional branch terminator.
864 Value *Cond = State.get(getOperand(0), VPLane(0));
865 // Replace the temporary unreachable terminator with a new conditional
866 // branch, hooking it up to backward destination for latch blocks now, and
867 // to forward destination(s) later when they are created.
868 // Second successor may be backwards - iff it is already in VPBB2IRBB.
869 VPBasicBlock *SecondVPSucc =
870 cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
871 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
872 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
873 auto *Br = Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
874 // First successor is always forward, reset it to nullptr.
875 Br->setSuccessor(0, nullptr);
// NOTE(review): a statement at embedded line 876 is not visible here.
877 applyMetadata(*Br);
878 return Br;
879 }
// Hidden case label; the branch splats the scalar operand across State.VF
// lanes.
881 return Builder.CreateVectorSplat(
882 State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
883 }
885 // For struct types, we need to build a new 'wide' struct type, where each
886 // element is widened, i.e., we create a struct of vectors.
887 auto *StructTy = cast<StructType>(getOperand(0)->getScalarType());
888 Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
// Each operand supplies one lane: every struct field of that operand is
// inserted at LaneIndex into the corresponding field vector of Res.
889 for (const auto &[LaneIndex, Op] : enumerate(operands())) {
890 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
891 FieldIndex++) {
892 Value *ScalarValue =
893 Builder.CreateExtractValue(State.get(Op, true), FieldIndex);
894 Value *VectorValue = Builder.CreateExtractValue(Res, FieldIndex);
895 VectorValue =
896 Builder.CreateInsertElement(VectorValue, ScalarValue, LaneIndex);
897 Res = Builder.CreateInsertValue(Res, VectorValue, FieldIndex);
898 }
899 }
900 return Res;
901 }
// Hidden case label; the branch builds a fixed-width vector with one element
// per operand, starting from poison.
903 auto *ScalarTy = getOperand(0)->getScalarType();
904 auto NumOfElements = ElementCount::getFixed(getNumOperands());
905 Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
906 for (const auto &[Idx, Op] : enumerate(operands()))
907 Res = Builder.CreateInsertElement(Res, State.get(Op, true),
908 Builder.getInt32(Idx));
909 return Res;
910 }
// Hidden case label; the branch splats operand 1 and inserts operand 0 into
// lane 0. For a scalar VF operand 0 is returned as is.
// NOTE(review): a statement at embedded line 915 is not visible here.
912 if (State.VF.isScalar())
913 return State.get(getOperand(0), true);
914 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
916 // If this start vector is scaled then it should produce a vector with fewer
917 // elements than the VF.
918 ElementCount VF = State.VF.divideCoefficientBy(
919 cast<VPConstantInt>(getOperand(2))->getZExtValue());
920 auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
921 return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
922 Builder.getInt32(0));
923 }
// Hidden case label; the branch combines all operands into one reduction
// result.
// NOTE(review): several lines of this branch are not visible (embedded lines
// 928, 938, 948 and 953-955), including the opening of an assertion, the
// condition choosing between the min/max and binary-op paths, and the start
// of the `Opcode` definition.
925 RecurKind RK = getRecurKind();
926 bool IsOrdered = isReductionOrdered();
927 bool IsInLoop = isReductionInLoop();
929 "FindIV should use min/max reduction kinds");
930
931 // The recipe may have multiple operands to be reduced together.
932 unsigned NumOperandsToReduce = getNumOperands();
933 VectorParts RdxParts(NumOperandsToReduce);
934 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
935 RdxParts[Part] = State.get(getOperand(Part), IsInLoop);
936
937 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
939
940 // Reduce multiple operands into one.
941 Value *ReducedPartRdx = RdxParts[0];
// For ordered reductions only the last operand is used; the parts are not
// combined pairwise.
942 if (IsOrdered) {
943 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
944 } else {
945 // Floating-point operations should have some FMF to enable the reduction.
946 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
947 Value *RdxPart = RdxParts[Part];
949 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
950 else {
951 // For sub-recurrences, each part's reduction variable is already
952 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
956 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK);
957 ReducedPartRdx =
958 Builder.CreateBinOp(Opcode, RdxPart, ReducedPartRdx, "bin.rdx");
959 }
960 }
961 }
962
963 // Create the reduction after the loop. Note that inloop reductions create
964 // the target reduction in the loop using a Reduction recipe.
965 if (State.VF.isVector() && !IsInLoop) {
966 // TODO: Support in-order reductions based on the recurrence descriptor.
967 // All ops in the reduction inherit fast-math-flags from the recurrence
968 // descriptor.
969 ReducedPartRdx = createSimpleReduction(Builder, ReducedPartRdx, RK);
970 }
971
972 return ReducedPartRdx;
973 }
// Hidden case label(s); the branch extracts a lane counted from the end of
// the vector.
// NOTE(review): the initializer of `Offset` (embedded line 977) and a
// statement at embedded line 989 are not visible here.
976 unsigned Offset =
978 Value *Res;
979 if (State.VF.isVector()) {
980 assert(Offset <= State.VF.getKnownMinValue() &&
981 "invalid offset to extract from");
982 // Extract lane VF - Offset from the operand.
983 Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
984 } else {
985 // TODO: Remove ExtractLastLane for scalar VFs.
986 assert(Offset <= 1 && "invalid offset to extract from");
987 Res = State.get(getOperand(0));
988 }
990 Res->setName(Name);
991 return Res;
992 }
// Hidden case label; the branch emits a logical AND of the two operands.
994 Value *A = State.get(getOperand(0));
995 Value *B = State.get(getOperand(1));
996 return Builder.CreateLogicalAnd(A, B, Name);
997 }
// Hidden case label; the branch emits a logical OR of the two operands.
999 Value *A = State.get(getOperand(0));
1000 Value *B = State.get(getOperand(1));
1001 return Builder.CreateLogicalOr(A, B, Name);
1002 }
1003 case VPInstruction::PtrAdd: {
1004 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
1005 "can only generate first lane for PtrAdd");
1006 Value *Ptr = State.get(getOperand(0), VPLane(0));
1007 Value *Addend = State.get(getOperand(1), VPLane(0));
1008 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
1009 }
// Hidden case label; a second pointer-add form whose addend is fetched as a
// full value rather than lane 0.
// NOTE(review): the initializer of `Ptr` (embedded line 1012) is not visible.
1011 Value *Ptr =
1013 Value *Addend = State.get(getOperand(1));
1014 return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
1015 }
// Each operand is frozen before being OR-ed together; for vector VFs the
// result is then OR-reduced to a single value.
1016 case VPInstruction::AnyOf: {
1017 Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
1018 for (VPValue *Op : drop_begin(operands()))
1019 Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
1020 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
1021 }
// Hidden case label; operand 0 is the lane to extract and operands 1..N are
// the source vectors, each covering RuntimeVF consecutive lanes.
1023 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
1024 "simplified to ExtractElement.");
1025 Value *LaneToExtract = State.get(getOperand(0), true);
1026 Type *IdxTy = getOperand(0)->getScalarType();
1027 Value *Res = nullptr;
1028 Value *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1029
// For every source vector an extract at the vector-relative index is emitted;
// a select keeps the extract from the last vector whose first lane is not
// above the requested lane.
1030 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
1031 Value *VectorStart =
1032 Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
1033 Value *VectorIdx = Idx == 1
1034 ? LaneToExtract
1035 : Builder.CreateSub(LaneToExtract, VectorStart);
1036 Value *Ext = State.VF.isScalar()
1037 ? State.get(getOperand(Idx))
1038 : Builder.CreateExtractElement(
1039 State.get(getOperand(Idx)), VectorIdx);
1040 if (Res) {
1041 Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
1042 Res = Builder.CreateSelect(Cmp, Ext, Res);
1043 } else {
1044 Res = Ext;
1045 }
1046 }
1047 return Res;
1048 }
// Hidden case label; the branch computes the index of the first active lane
// across one or more mask operands.
1050 Type *Ty = this->getScalarType();
1051 if (getNumOperands() == 1) {
1052 Value *Mask = State.get(getOperand(0));
1053 return Builder.CreateCountTrailingZeroElems(Ty, Mask,
1054 /*ZeroIsPoison=*/false, Name);
1055 }
1056 // If there are multiple operands, create a chain of selects to pick the
1057 // first operand with an active lane and add the number of lanes of the
1058 // preceding operands.
1059 Value *RuntimeVF = getRuntimeVF(Builder, Ty, State.VF);
1060 unsigned LastOpIdx = getNumOperands() - 1;
1061 Value *Res = nullptr;
// Operands are visited from last to first so that the select for the earliest
// operand ends up outermost in the chain.
// NOTE(review): the line opening the vector-VF arm of the conditional below
// (embedded line 1069) is not visible here.
1062 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
1063 Value *TrailingZeros =
1064 State.VF.isScalar()
1065 ? Builder.CreateZExt(
1066 Builder.CreateICmpEQ(State.get(getOperand(Idx)),
1067 Builder.getFalse()),
1068 Ty)
1070 Ty, State.get(getOperand(Idx)),
1071 /*ZeroIsPoison=*/false, Name);
1072 Value *Current = Builder.CreateAdd(
1073 Builder.CreateMul(RuntimeVF, ConstantInt::get(Ty, Idx)),
1074 TrailingZeros);
1075 if (Res) {
1076 Value *Cmp = Builder.CreateICmpNE(TrailingZeros, RuntimeVF);
1077 Res = Builder.CreateSelect(Cmp, Current, Res);
1078 } else {
1079 Res = Current;
1080 }
1081 }
1082
1083 return Res;
1084 }
// Hidden case labels for the two single-statement branches below: the first
// returns operand 0 as a scalar, the second reverses the vector operand.
1086 return State.get(getOperand(0), true);
1088 return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
// Hidden case label; operand 0 is the starting result, followed by
// (data, mask) operand pairs that are folded into it in order.
1090 Value *Result = State.get(getOperand(0), /*IsScalar=*/true);
1091 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
1092 Value *Data = State.get(getOperand(Idx));
1093 Value *Mask = State.get(getOperand(Idx + 1));
1094 Type *VTy = Data->getType();
1095
1096 if (State.VF.isScalar())
1097 Result = Builder.CreateSelect(Mask, Data, Result);
1098 else
1099 Result = Builder.CreateIntrinsic(
1100 Intrinsic::experimental_vector_extract_last_active, {VTy},
1101 {Data, Mask, Result});
1102 }
1103
1104 return Result;
1105 }
1106 default:
1107 llvm_unreachable("Unsupported opcode for instruction");
1108 }
1109}
1110
1112 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
     // NOTE(review): the line opening this signature (return type and
     // qualified name) is missing from this copy, as are several lines in
     // the body (see the gaps in the embedded line numbers); presumably this
     // is getCostForRecipeWithOpcode -- TODO confirm against upstream.
     //
     // Returns the cost of \p Opcode at vectorization factor \p VF as
     // reported by Ctx.TTI. ResultTy is this recipe's scalar type, widened
     // to a vector type when \p VF is a vector. Opcodes without a case in
     // the switch reach the llvm_unreachable at the end.
1113 Type *ScalarTy = this->getScalarType();
1114 Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
1115 switch (Opcode) {
1116 case Instruction::FNeg:
1117 return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
1118 case Instruction::UDiv:
1119 case Instruction::SDiv:
1120 case Instruction::SRem:
1121 case Instruction::URem:
1122 case Instruction::Add:
1123 case Instruction::FAdd:
1124 case Instruction::Sub:
1125 case Instruction::FSub:
1126 case Instruction::Mul:
1127 case Instruction::FMul:
1128 case Instruction::FDiv:
1129 case Instruction::FRem:
1130 case Instruction::Shl:
1131 case Instruction::LShr:
1132 case Instruction::AShr:
1133 case Instruction::And:
1134 case Instruction::Or:
1135 case Instruction::Xor: {
1136 // Certain instructions can be cheaper if they have a constant second
1137 // operand. One example of this are shifts on x86.
1138 VPValue *RHS = getOperand(1);
1139 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
1140
     // NOTE(review): the remainder of this condition and the declarations
     // of CtxI and Operands are on lines missing from this copy.
1141 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1144
1147 if (CtxI)
1148 Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
     // The first operand is always described as an arbitrary value; only the
     // second operand's info (RHSInfo) is refined.
1149 return Ctx.TTI.getArithmeticInstrCost(
1150 Opcode, ResultTy, Ctx.CostKind,
1151 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1152 RHSInfo, Operands, CtxI, &Ctx.TLI);
1153 }
1154 case Instruction::Freeze:
1155 // NOTE: The only way to ask for the cost is via getInstructionCost, which
1156 // requires the actual vector instruction. Instead, both here and in the
1157 // LoopVectorizationCostModel::getInstructionCost the costs mirror the
1158 // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep
1159 // them in sync.
1160 return TTI::TCC_Free;
1161 case Instruction::ExtractValue:
1162 return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
1163 Ctx.CostKind);
1164 case Instruction::ICmp:
1165 case Instruction::FCmp: {
     // Compares are costed on the type of their first operand, not on the
     // result type; the result type passed to TTI is derived from OpTy.
1166 Type *ScalarOpTy = getOperand(0)->getScalarType();
1167 Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
     // NOTE(review): CtxI is declared on a line missing from this copy.
1169 return Ctx.TTI.getCmpSelInstrCost(
1170 Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
1171 Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
1172 {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
1173 }
1174 case Instruction::BitCast: {
     // A bitcast whose result scalar type is a pointer is costed as 0; any
     // other bitcast falls through to the generic cast costing below.
     // NOTE(review): this redeclares ScalarTy with the same initializer as
     // the function-level ScalarTy, which it shadows.
1175 Type *ScalarTy = this->getScalarType();
1176 if (ScalarTy->isPointerTy())
1177 return 0;
1178 [[fallthrough]];
1179 }
1180 case Instruction::SExt:
1181 case Instruction::ZExt:
1182 case Instruction::FPToUI:
1183 case Instruction::FPToSI:
1184 case Instruction::FPExt:
1185 case Instruction::PtrToInt:
1186 case Instruction::PtrToAddr:
1187 case Instruction::IntToPtr:
1188 case Instruction::SIToFP:
1189 case Instruction::UIToFP:
1190 case Instruction::Trunc:
1191 case Instruction::FPTrunc:
1192 case Instruction::AddrSpaceCast: {
1193 // Computes the CastContextHint from a recipe that may access memory.
     // NOTE(review): most return statements of this lambda are on lines
     // missing from this copy; only the branching structure is visible.
1194 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1195 if (isa<VPInterleaveBase>(R))
1197 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) {
1198 // Only compute CCH for memory operations, matching the legacy model
1199 // which only considers loads/stores for cast context hints.
1200 auto *UI = cast<Instruction>(ReplicateRecipe->getUnderlyingValue());
1201 if (!isa<LoadInst, StoreInst>(UI))
1203 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1205 }
1206 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(R);
1207 if (WidenMemoryRecipe == nullptr)
1209 if (VF.isScalar())
1211 if (!WidenMemoryRecipe->isConsecutive())
1213 if (WidenMemoryRecipe->isMasked())
1216 };
1217
1218 VPValue *Operand = getOperand(0);
     // NOTE(review): CCH is declared on a line missing from this copy.
     // IsReverse records that a reverse was looked through when locating the
     // recipe the hint is computed from.
1220 bool IsReverse = false;
1221 // For Trunc/FPTrunc, get the context from the only user.
1222 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1223 if (auto *Recipe = cast_or_null<VPRecipeBase>(getSingleUser())) {
1224 if (match(Recipe,
1228 IsReverse = true;
1230 Recipe->getVPSingleValue()->getSingleUser());
1231 }
1232 if (Recipe)
1233 CCH = ComputeCCH(Recipe);
1234 }
1235 }
1236 // For ZExt/SExt/FPExt, get the context from the operand.
1237 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1238 Opcode == Instruction::FPExt) {
1239 if (auto *Recipe = Operand->getDefiningRecipe()) {
1240 VPValue *ReverseOp;
1241 if (match(Recipe,
1242 m_CombineOr(m_Reverse(m_VPValue(ReverseOp)),
1244 m_VPValue(ReverseOp))))) {
     // Look through the reverse: the hint comes from whatever defines the
     // reversed value, which may have no defining recipe (hence the null
     // check below).
1245 Recipe = ReverseOp->getDefiningRecipe();
1246 IsReverse = true;
1247 }
1248 if (Recipe)
1249 CCH = ComputeCCH(Recipe);
1250 }
1251 }
     // NOTE(review): the statement guarded by this condition is on a line
     // missing from this copy.
1252 if (IsReverse && CCH != TTI::CastContextHint::None)
1254
1255 auto *ScalarSrcTy = Operand->getScalarType();
1256 Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
1257 // Arm TTI will use the underlying instruction to determine the cost.
1258 return Ctx.TTI.getCastInstrCost(
1259 Opcode, ResultTy, SrcTy, CCH, Ctx.CostKind,
1261 }
1262 case Instruction::Select: {
     // A condition defined outside the loop regions is treated as scalar.
     // NOTE(review): SI (used below) is declared on a line missing from this
     // copy, and ScalarTy is redeclared here, shadowing the function-level
     // variable of the same name and value.
1264 bool IsScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
1265 Type *ScalarTy = this->getScalarType();
1266
1267 VPValue *Op0, *Op1;
1268 bool IsLogicalAnd =
1269 match(this, m_c_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1)));
1270 bool IsLogicalOr =
1271 match(this, m_c_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
1272 // Also match the inverted forms:
1273 // select x, false, y --> !x & y (still AND)
1274 // select x, y, true --> !x | y (still OR)
1275 IsLogicalAnd |=
1276 match(this, m_Select(m_VPValue(Op0), m_False(), m_VPValue(Op1)));
1277 IsLogicalOr |=
1278 match(this, m_Select(m_VPValue(Op0), m_VPValue(Op1), m_True()));
1279
     // i1 selects with a non-scalar condition that express a logical and/or
     // are costed as the corresponding And/Or arithmetic instruction.
1280 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1281 (IsLogicalAnd || IsLogicalOr)) {
1282 // select x, y, false --> x & y
1283 // select x, true, y --> x | y
1284 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(Op0);
1285 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(Op1);
1286
     // The underlying select's IR operands are only forwarded when every
     // VPlan operand has an underlying value.
1288 if (SI && all_of(operands(),
1289 [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1290 append_range(Operands, SI->operands());
1291 return Ctx.TTI.getArithmeticInstrCost(
1292 IsLogicalOr ? Instruction::Or : Instruction::And, ResultTy,
1293 Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
1294 }
1295
     // The condition type is only widened when the condition is not scalar
     // and VF is a vector.
1296 Type *CondTy = getOperand(0)->getScalarType();
1297 if (!IsScalarCond && VF.isVector())
1298 CondTy = VectorType::get(CondTy, VF);
1299
     // Take the predicate from the condition when it is a compare recipe, or
     // else from a wrapped IR CmpInst; otherwise Pred keeps its
     // default-constructed value.
1300 llvm::CmpPredicate Pred;
1301 if (!match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue())))
1302 if (auto *CondIRV = dyn_cast<VPIRValue>(getOperand(0)))
1303 if (auto *Cmp = dyn_cast<CmpInst>(CondIRV->getValue()))
1304 Pred = Cmp->getPredicate();
1305 Type *VectorTy = toVectorTy(this->getScalarType(), VF);
1306 return Ctx.TTI.getCmpSelInstrCost(
1307 Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
1308 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
1309 }
1310 }
1311 llvm_unreachable("called for unsupported opcode");
1312 }
1313
1315 VPCostContext &Ctx) const {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably VPInstruction::computeCost -- TODO confirm), as are
     // several case labels and statements below; gaps in the embedded line
     // numbers mark the missing lines.
     //
     // Computes the cost of this recipe at \p VF by dispatching on
     // getOpcode(). Opcodes that are not handled end up in the default case
     // and are costed as 0.
     //
     // Without an underlying IR value the cost is 0, unless the opcode is
     // FMul.
1317 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1318 // TODO: Compute cost for VPInstructions without underlying values once
1319 // the legacy cost model has been retired.
1320 return 0;
1321 }
1322
1324 "Should only generate a vector value or single scalar, not scalars "
1325 "for all lanes.");
1327 getOpcode(),
1329 }
1330
1331 switch (getOpcode()) {
1332 case Instruction::Select: {
     // Pred is declared on a line missing from this copy. The result of
     // match() is discarded, so Pred is only updated when operand 0 is a
     // compare.
1334 match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
1335 auto *CondTy = getOperand(0)->getScalarType();
1336 auto *VecTy = getOperand(1)->getScalarType();
     // Keep scalar types when only the first lane is used; otherwise widen
     // both the condition and the value type to VF.
1337 if (!vputils::onlyFirstLaneUsed(this)) {
1338 CondTy = toVectorTy(CondTy, VF);
1339 VecTy = toVectorTy(VecTy, VF);
1340 }
1341 return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
1342 Ctx.CostKind);
1343 }
1344 case Instruction::ExtractElement:
1346 if (VF.isScalar()) {
1347 // ExtractLane with VF=1 takes care of handling extracting across multiple
1348 // parts.
1349 return 0;
1350 }
1351
1352 // Add on the cost of extracting the element.
1353 auto *VecTy = toVectorTy(getOperand(0)->getScalarType(), VF);
1354 return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
1355 Ctx.CostKind);
1356 }
1357 case VPInstruction::AnyOf: {
     // Costed as an Or reduction over a vector of the result scalar type.
1358 auto *VecTy = toVectorTy(this->getScalarType(), VF);
1359 return Ctx.TTI.getArithmeticReductionCost(
1360 Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
1361 }
     // NOTE(review): case label missing from this copy (presumably
     // FirstActiveLane). Scalar VF is costed as an ICmp, vector VF as a
     // cttz_elts intrinsic on the widened operand type.
1363 Type *Ty = this->getScalarType();
1364 Type *ScalarTy = getOperand(0)->getScalarType();
1365 if (VF.isScalar())
1366 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1368 CmpInst::ICMP_EQ, Ctx.CostKind);
1369 // Calculate the cost of determining the lane index.
1370 auto *PredTy = toVectorTy(ScalarTy, VF);
1371 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1372 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1373 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1374 }
     // NOTE(review): case label missing from this copy (presumably
     // LastActiveLane). Same as the previous case plus an Xor on the
     // predicate and a Sub on the index for vector VFs.
1376 Type *Ty = this->getScalarType();
1377 Type *ScalarTy = getOperand(0)->getScalarType();
1378 if (VF.isScalar())
1379 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1381 CmpInst::ICMP_EQ, Ctx.CostKind);
1382 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1383 auto *PredTy = toVectorTy(ScalarTy, VF);
1384 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1385 {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
1386 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1387 // Add cost of NOT operation on the predicate.
1388 Cost += Ctx.TTI.getArithmeticInstrCost(
1389 Instruction::Xor, PredTy, Ctx.CostKind,
1390 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1391 {TargetTransformInfo::OK_UniformConstantValue,
1392 TargetTransformInfo::OP_None});
1393 // Add cost of SUB operation on the index.
1394 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Sub, Ty, Ctx.CostKind);
1395 return Cost;
1396 }
     // NOTE(review): case label and the declaration of ICA are missing from
     // this copy; the intrinsic costed is
     // experimental_vector_extract_last_active with (vector, mask, scalar)
     // argument types.
1398 Type *ScalarTy = this->getScalarType();
1399 Type *VecTy = toVectorTy(ScalarTy, VF);
1400 Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
1402 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1403 {VecTy, MaskTy, ScalarTy});
1404 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind);
1405 }
     // NOTE(review): case label and the shuffle-kind argument are missing
     // from this copy; the assert message names FirstOrderRecurrenceSplice.
1407 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1408 Type *VectorTy = toVectorTy(this->getScalarType(), VF);
1409 return Ctx.TTI.getShuffleCost(
1411 cast<VectorType>(VectorTy), {}, Ctx.CostKind, -1);
1412 }
     // NOTE(review): case label missing from this copy. Costs a
     // get_active_lane_mask intrinsic whose i1 result vector has
     // VF * Multiplier elements, with Multiplier read from constant
     // operand 2.
1414 Type *ArgTy = getOperand(0)->getScalarType();
1415 unsigned Multiplier = cast<VPConstantInt>(getOperand(2))->getZExtValue();
1416 Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
1417 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1418 {ArgTy, ArgTy});
1419 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1420 }
     // NOTE(review): case label missing from this copy. Costs an
     // experimental_get_vector_length intrinsic returning i32.
1422 Type *Arg0Ty = getOperand(0)->getScalarType();
1423 Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
1424 Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
1425 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1426 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1427 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1428 }
     // NOTE(review): case label missing from this copy; the assert message
     // identifies this as the reverse operation.
1430 assert(VF.isVector() && "Reverse operation must be vector type");
1431 Type *EltTy = this->getScalarType();
1432 // Skip the reverse operation cost for the mask.
1433 // FIXME: Remove this once redundant mask reverse operations can be
1434 // eliminated by VPlanTransforms::cse before cost computation.
1435 if (EltTy->isIntegerTy(1))
1436 return 0;
1437 auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
1438 return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
1439 VectorTy, /*Mask=*/{}, Ctx.CostKind,
1440 /*Index=*/0);
1441 }
     // NOTE(review): case label missing from this copy. Costs an
     // ExtractElement indexed from the end of the vector (index 0).
1443 // Add on the cost of extracting the element.
1444 auto *VecTy = toVectorTy(getOperand(0)->getScalarType(), VF);
1445 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
1446 VecTy, Ctx.CostKind, 0);
1447 }
1448 case VPInstruction::Not: {
1449 Type *ValTy = this->getScalarType();
1450 // InstCombine will fold `xor` to the conditional branch.
1451 if (auto *U = const_cast<VPUser *>(getSingleUser()))
1452 if (match(U, m_BranchOnCond(m_VPValue())))
1453 return 0;
     // Otherwise cost an Xor, on the vector type unless only the first lane
     // is used.
1454 if (!vputils::onlyFirstLaneUsed(this))
1455 ValTy = toVectorTy(ValTy, VF);
1456 return Ctx.TTI.getArithmeticInstrCost(Instruction::Xor, ValTy,
1457 Ctx.CostKind);
1458 }
     // NOTE(review): case label and the declaration of TCConst are missing
     // from this copy; the comments below identify this as BranchOnCount.
1460 // If TC <= VF then this is just a branch.
1461 // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
1462 // where it checks TC <= VF * UF, but we don't know UF yet. This means in
1463 // some cases we get a cost that's too high due to counting a cmp that
1464 // later gets removed.
1465 // FIXME: The compare could also be removed if TC = M * vscale,
1466 // VF = N * vscale, and M <= N. Detecting that would require having the
1467 // trip count as a SCEV though.
1470 if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
1471 return 0;
1472 // Otherwise BranchOnCount generates ICmpEQ followed by a branch.
1473 Type *ValTy = getOperand(0)->getScalarType();
1474 return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
1476 CmpInst::ICMP_EQ, Ctx.CostKind);
1477 }
1478 case Instruction::FCmp:
1479 case Instruction::ICmp:
1481 getOpcode(),
1484 if (VF == ElementCount::getScalable(1))
1486 [[fallthrough]];
1487 default:
1488 // TODO: Compute cost for other VPInstructions once the legacy cost model has
1489 // been retired.
     // NOTE(review): the assert message below contains a typo ("witht");
     // left untouched here because it is program text, not a comment.
1491 "unexpected VPInstruction witht underlying value");
1492 return 0;
1493 }
1494 }
1495
1508
1510 switch (getOpcode()) {
     // NOTE(review): the signature of this function, the lines between the
     // PHI case and `return true`, and the statement of the default branch
     // are all missing from this copy. What is visible: Load and PHI opcodes
     // yield true. Presumably this is VPInstruction::isSingleScalar -- TODO
     // confirm against upstream.
1511 case Instruction::Load:
1512 case Instruction::PHI:
1516 return true;
1517 default:
1519 }
1520 }
1521
1523 #ifndef NDEBUG
     // NOTE(review): the signature, the case labels of the switch and the
     // statement after #endif are missing from this copy; presumably this is
     // the function that appends \p Op to an existing VPInstruction -- TODO
     // confirm.
     //
     // Assertion-only validation: checks that the scalar type of the operand
     // being appended is consistent with the operands already present.
     // Opcodes without a case are rejected via llvm_unreachable.
1524 Type *Ty = Op->getScalarType();
1525 switch (getOpcode()) {
1529 assert(Ty == getOperand(0)->getScalarType() &&
1530 "types of operand 0 and new operand must match");
1531 break;
1535 assert(Ty == getOperand(0)->getScalarType() &&
1536 "appended operand must match operand 0's scalar type");
1537 break;
1539 assert(Ty == getOperand(1)->getScalarType() &&
1540 "appended operand must match operand 1's scalar type");
1541 break;
1543 // The recipe is constructed with 3 operands (result, data, mask). Extra
1544 // operands beyond that are appended in (data, mask) pairs.
1545 constexpr unsigned NumInitialOperands = 3;
1546 assert(getNumOperands() >= NumInitialOperands &&
1547 "ExtractLastActive must have at least the initial 3 operands");
     // An even number of operands past the initial three means the new
     // operand opens a (data, mask) pair; an odd number means it is the mask
     // closing one. getNumOperands() presumably still excludes \p Op here,
     // since nothing visible above appends it.
1548 bool IsMaskSlot = ((getNumOperands() - NumInitialOperands) & 1u) == 1u;
1549 assert((IsMaskSlot ? Ty->isIntegerTy(1)
1550 : Ty == getOperand(1)->getScalarType()) &&
1551 "ExtractLastActive expects alternating data/mask operands "
1552 "matching operand 1's type and i1, respectively");
1553 break;
1554 }
1555 default:
1556 llvm_unreachable("opcode does not support growing the operand list "
1557 "outside of construction");
1558 }
1559 #endif
1561 }
1562
1564 assert(!isMasked() && "cannot execute masked VPInstruction");
     // NOTE(review): the signature and several statement fragments are
     // missing from this copy (presumably VPInstruction::execute -- TODO
     // confirm).
     //
     // Generates the IR value for this recipe via generate(State) and, when
     // the recipe has a result, records it in State either as a single
     // scalar or as a per-VF value.
     //
     // The guard is scoped to this call; the builder's fast-math flags are
     // overwritten right below (presumably restored on scope exit).
1565 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1567 "Set flags not supported for the provided opcode");
1569 "Opcode requires specific flags to be set");
1570 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
1571 Value *GeneratedValue = generate(State);
     // Recipes without a result have nothing to record.
1572 if (!hasResult())
1573 return;
1574 assert(GeneratedValue && "generate must produce a value");
1575 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
     // Sanity check: unless VF is scalar, a vector or struct value must have
     // been generated exactly when the value is not first-lane-only.
1578 assert((((GeneratedValue->getType()->isVectorTy() ||
1579 GeneratedValue->getType()->isStructTy()) ==
1580 !GeneratesPerFirstLaneOnly) ||
1581 State.VF.isScalar()) &&
1582 "scalar value but not only first lane defined");
1583 State.set(this, GeneratedValue,
1584 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1586 getOpcode() == Instruction::Freeze) {
1587 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1588 // resume phis, and to let epilogue vectorization recover the frozen
1589 // reduction start from the main plan. Must be removed once epilogue
1590 // vectorization explicitly connects VPlans.
1591 setUnderlyingValue(GeneratedValue);
1592 }
1593 }
1594
1598 return false;
     // NOTE(review): the signature, the condition guarding the early return
     // above, and many case labels are missing from this copy (presumably
     // VPInstruction::opcodeMayReadOrWriteFromMemory -- TODO confirm).
     //
     // Conservative classification by opcode: the listed opcodes are known
     // not to touch memory; anything not listed is assumed to read or write
     // memory.
1599 switch (getOpcode()) {
1600 case Instruction::ExtractValue:
1601 case Instruction::InsertValue:
1602 case Instruction::GetElementPtr:
1603 case Instruction::ExtractElement:
1604 case Instruction::InsertElement:
1605 case Instruction::Freeze:
1606 case Instruction::FCmp:
1607 case Instruction::ICmp:
1608 case Instruction::Select:
1609 case Instruction::PHI:
1634 case VPInstruction::Not:
1643 return false;
     // NOTE(review): the handling of Call is on lines missing from this copy.
1644 case Instruction::Call:
1647 default:
1648 return true;
1649 }
1650 }
1651
1653 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
     // NOTE(review): the signature, the condition guarding the first return,
     // and several case labels are missing from this copy (presumably
     // VPInstruction::usesFirstLaneOnly -- TODO confirm).
     //
     // Reports whether this recipe only uses the first lane of operand
     // \p Op. Opcodes without a case conservatively answer false.
1655 return vputils::onlyFirstLaneUsed(this);
1656
1657 switch (getOpcode()) {
1658 default:
1659 return false;
1660 case Instruction::ExtractElement:
1661 return Op == getOperand(1);
1662 case Instruction::InsertElement:
1663 return Op == getOperand(1) || Op == getOperand(2);
1664 case Instruction::PHI:
1665 return true;
     // For these opcodes the answer for the operand follows from how the
     // result itself is used.
1666 case Instruction::FCmp:
1667 case Instruction::ICmp:
1668 case Instruction::Select:
1669 case Instruction::Or:
1670 case Instruction::Freeze:
1671 case VPInstruction::Not:
1672 // TODO: Cover additional opcodes.
1673 return vputils::onlyFirstLaneUsed(this);
1674 case Instruction::Load:
1685 return true;
1688 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1689 // operand, after replicating its operands only the first lane is used.
1690 // Before replicating, it will have only a single operand.
1691 return getNumOperands() > 1;
1693 return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
1695 // WidePtrAdd supports scalar and vector base addresses.
1696 return false;
1699 return Op == getOperand(0);
1700 };
1701 llvm_unreachable("switch should return");
1702 }
1703
1705 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
     // NOTE(review): the signature, the condition guarding the first return,
     // and the case labels preceding `return true` are missing from this
     // copy (presumably VPInstruction::usesFirstPartOnly -- TODO confirm).
     //
     // Reports whether this recipe only uses the first part of operand
     // \p Op. Opcodes without a case conservatively answer false; compares
     // and selects defer to how the result itself is used.
1707 return vputils::onlyFirstPartUsed(this);
1708
1709 switch (getOpcode()) {
1710 default:
1711 return false;
1712 case Instruction::FCmp:
1713 case Instruction::ICmp:
1714 case Instruction::Select:
1715 return vputils::onlyFirstPartUsed(this);
1720 return true;
1721 };
1722 llvm_unreachable("switch should return");
1723 }
1724
1725#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1727 VPSlotTracker SlotTracker(getParent()->getPlan());
     // NOTE(review): the signature and the statement that uses SlotTracker
     // are missing from this copy. Visible behavior: a slot tracker is built
     // for the plan that owns this recipe's parent block.
1729 }
1730
1732 VPSlotTracker &SlotTracker) const {
     // NOTE(review): the line opening this signature, almost all case labels
     // of the switch, and the operand-printing statements are missing from
     // this copy (presumably VPInstruction::printRecipe -- TODO confirm).
     //
     // Prints "EMIT" (or "EMIT-SCALAR" for single-scalar recipes), the
     // result when there is one, a textual mnemonic selected by opcode, and
     // then the recipe's flags.
1733 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1734
1735 if (hasResult()) {
1737 O << " = ";
1738 }
1739
     // Each visible branch below emits the mnemonic for one opcode.
1740 switch (getOpcode()) {
1741 case VPInstruction::Not:
1742 O << "not";
1743 break;
1745 O << "active lane mask";
1746 break;
1748 O << "incoming-alias-mask";
1749 break;
1751 O << "EXPLICIT-VECTOR-LENGTH";
1752 break;
1754 O << "first-order splice";
1755 break;
1757 O << "branch-on-cond";
1758 break;
1760 O << "branch-on-two-conds";
1761 break;
1763 O << "TC > VF ? TC - VF : 0";
1764 break;
1766 O << "VF * Part +";
1767 break;
1769 O << "branch-on-count";
1770 break;
1772 O << "broadcast";
1773 break;
1775 O << "buildstructvector";
1776 break;
1778 O << "buildvector";
1779 break;
1781 O << "exiting-iv-value";
1782 break;
1784 O << "masked-cond";
1785 break;
1787 O << "extract-lane";
1788 break;
1790 O << "extract-last-lane";
1791 break;
1793 O << "extract-last-part";
1794 break;
1796 O << "extract-penultimate-element";
1797 break;
1799 O << "compute-reduction-result";
1800 break;
1802 O << "logical-and";
1803 break;
1805 O << "logical-or";
1806 break;
1808 O << "ptradd";
1809 break;
1811 O << "wide-ptradd";
1812 break;
1814 O << "any-of";
1815 break;
1817 O << "first-active-lane";
1818 break;
1820 O << "last-active-lane";
1821 break;
1823 O << "reduction-start-vector";
1824 break;
1826 O << "resume-for-epilogue";
1827 break;
1829 O << "reverse";
1830 break;
1832 O << "unpack";
1833 break;
1835 O << "extract-last-active";
1836 break;
1838 O << "num-active-lanes";
1839 break;
     // NOTE(review): the default branch's statement is missing from this
     // copy.
1840 default:
1842 }
1843
1844 printFlags(O);
1846 }
1847#endif
1848
1850 Type *ResultTy = getResultType();
     // NOTE(review): the signature, the condition opening the cast branch,
     // and the first case label of the switch are missing from this copy
     // (presumably VPInstructionWithType::execute -- TODO confirm).
     //
     // Cast path: casts lane 0 of operand 0 to ResultTy and records the
     // result for lane 0 only. Flags and metadata are applied only when the
     // builder returned an Instruction (dyn_cast may fail, e.g. if the cast
     // was folded).
1852 Value *Op = State.get(getOperand(0), VPLane(0));
1853 Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
1854 Op, ResultTy);
1855 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
1856 applyFlags(*CastOp);
1857 applyMetadata(*CastOp);
1858 }
1859 State.set(this, Cast, VPLane(0));
1860 return;
1861 }
1862 switch (getOpcode()) {
     // Step vector of ResultTy elements with State.VF lanes, stored as a
     // regular (non-scalar) value.
1864 Value *StepVector =
1865 State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
1866 State.set(this, StepVector);
1867 break;
1868 }
1869 case VPInstruction::VScale: {
     // vscale is stored as a scalar value (third argument `true`).
1870 Value *VScale = State.Builder.CreateVScale(ResultTy);
1871 State.set(this, VScale, true);
1872 break;
1873 }
1874
1875 default:
1876 llvm_unreachable("opcode not implemented yet");
1877 }
1878 }
1879
1881 VPCostContext &Ctx) const {
     // NOTE(review): the line opening this signature, the statement that
     // ends in `Ctx);` below, and one case label are missing from this copy
     // (presumably VPInstructionWithType::computeCost -- TODO confirm).
     //
     // Returns the cost of this typed instruction; opcodes other than the
     // ones handled below are not expected to be queried.
1882 // NOTE: At the moment it seems only possible to expose this path for
1883 // the trunc, zext and sext opcodes. However, isScalarCast also covers
1884 // int<>fp conversions, bitcasts, ptr<>int conversions, etc.
1887 Ctx);
1888
1889 switch (getOpcode()) {
1890 case VPInstruction::VScale: {
     // Costed as a call to the vscale intrinsic, which takes no arguments.
1891 Type *Ty = this->getScalarType();
1892 ArrayRef<Type *> Tys;
1893 IntrinsicCostAttributes Attrs(Intrinsic::vscale, Ty, Tys);
1894 return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
1895 }
1897 // TODO: This isn't quite right since even if the step-vector is hoisted
1898 // out of the loop it has a non-zero cost in the middle block, etc.
1899 // Once the stepvector is correctly hoisted out of the vector loop by the
1900 // licm transform we can add the cost here so that it doesn't incorrectly
1901 // affect the choice of VF.
1902 return 0;
1903 default:
1904 // Although VPInstructionWithType is also used for
1905 // VPInstruction::WideIVStep it isn't currently possible to expose cases
1906 // where the cost is queried.
1907 llvm_unreachable("Unhandled opcode");
1908 }
1909 return 0;
1910 }
1911
1912#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1914 VPSlotTracker &SlotTracker) const {
     // NOTE(review): the line opening this signature, three case labels and
     // all operand-printing statements are missing from this copy
     // (presumably VPInstructionWithType::printRecipe -- TODO confirm).
     //
     // Prints "EMIT"/"EMIT-SCALAR", the result, and an opcode-specific
     // mnemonic. Opcodes without an explicit case must be casts and are
     // printed with their flags followed by " to <result type>".
1915 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1917 O << " = ";
1918
1919 Type *ResultTy = getResultType();
1920 switch (getOpcode()) {
1922 O << "wide-iv-step ";
1924 break;
1926 O << "step-vector " << *ResultTy;
1927 break;
1929 O << "vscale " << *ResultTy;
1930 break;
1931 case Instruction::Load:
1932 O << "load ";
1934 break;
1935 default:
1936 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1938 printFlags(O);
1940 O << " to " << *ResultTy;
1941 }
1942 }
1943#endif
1944
1945 /// Shared execute logic for VPPhi and VPWidenPHIRecipe. Creates a PHI node,
1946 /// adds incoming values, and stores the result in State. For header phis, only
1947 /// the preheader incoming value is added; the backedge is fixed up later by
1948 /// VPlan::execute().
1950 VPTransformState &State, bool IsScalar,
1951 const Twine &Name) {
     // NOTE(review): the first line of this signature (function name and
     // the R / Phi parameters) is missing from this copy.
     //
     // \p IsScalar selects whether incoming values are fetched from, and the
     // new phi is recorded in, State as scalar values. \p Name names the
     // created PHI node.
     //
     // In a header block only the first incoming value is wired up here.
1952 unsigned NumIncoming = VPBlockUtils::isHeader(R->getParent(), State.VPDT)
1953 ? 1
1954 : Phi.getNumIncoming();
     // The first incoming value is always added and also determines the type
     // of the new phi; space for 2 incoming values is reserved up front.
1955 Value *FirstInc = State.get(Phi.getIncomingValue(0), IsScalar);
1956 PHINode *NewPhi = State.Builder.CreatePHI(FirstInc->getType(), 2, Name);
1957 NewPhi->addIncoming(FirstInc,
1958 State.CFG.VPBB2IRBB.at(Phi.getIncomingBlock(0)));
     // Remaining incoming values (none for header phis).
1959 for (unsigned Idx = 1; Idx != NumIncoming; ++Idx)
1960 NewPhi->addIncoming(State.get(Phi.getIncomingValue(Idx), IsScalar),
1961 State.CFG.VPBB2IRBB.at(Phi.getIncomingBlock(Idx)));
1962 State.set(R, NewPhi, IsScalar);
1963 }
1964
1966 executePhiRecipe(this, *this, State, /*IsScalar=*/true, getName());
     // NOTE(review): the signature is missing from this copy (presumably
     // VPPhi::execute -- TODO confirm). The phi is always generated as a
     // scalar value, named after this recipe.
1967 }
1968
1969#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Prints this phi as "EMIT"/"EMIT-SCALAR", followed by " = phi" and the
/// recipe's flags.
/// NOTE(review): the statements printing the result and the incoming
/// operands are on lines missing from this copy.
1970 void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1971 VPSlotTracker &SlotTracker) const {
1972 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1974 O << " = phi";
1975 printFlags(O);
1977 }
1978#endif
1979
1980VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1981 if (auto *Phi = dyn_cast<PHINode>(&I))
1982 return new VPIRPhi(*Phi);
1983 return new VPIRInstruction(I);
1984}
1985
1987 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1988 "PHINodes must be handled by VPIRPhi");
     // NOTE(review): the signature is missing from this copy (presumably
     // VPIRInstruction::execute -- TODO confirm). No IR is emitted here; the
     // only effect is repositioning the builder.
1989 // Advance the insert point after the wrapped IR instruction. This allows
1990 // interleaving VPIRInstructions and other recipes.
1991 State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator()));
1992 }
1993
1995 VPCostContext &Ctx) const {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably VPIRInstruction::computeCost -- TODO confirm).
     // Neither parameter influences the result.
1996 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1997 // hence it does not contribute to the cost-modeling for the VPlan.
1998 return 0;
1999 }
2000
2001#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2003 VPSlotTracker &SlotTracker) const {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably VPIRInstruction::printRecipe -- TODO confirm).
     // Prints "IR " followed by the wrapped IR instruction; SlotTracker is
     // not used.
2004 O << Indent << "IR " << I;
2005 }
2006#endif
2007
2009 PHINode *Phi = &getIRPhi();
     // NOTE(review): the signature is missing from this copy (presumably
     // VPIRPhi::execute -- TODO confirm).
     //
     // For every operand, materializes the value in the IR block that
     // corresponds to the matching VPlan predecessor and installs it as that
     // block's incoming value of the wrapped IR phi. Operand Idx pairs with
     // predecessor Idx of the parent block.
2010 for (const auto &[Idx, Op] : enumerate(operands())) {
2011 VPValue *ExitValue = Op;
     // Non-single-scalar values are read from the last lane for the current
     // VF (the single-scalar alternative is on a line missing from this
     // copy).
2012 auto Lane = vputils::isSingleScalar(ExitValue)
2014 : VPLane::getLastLaneForVF(State.VF);
2015 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
2016 auto *PredVPBB = Pred->getExitingBasicBlock();
     // NOTE(review): this lookup uses operator[], whereas executePhiRecipe
     // uses .at() on the same map -- confirm a missing entry cannot occur
     // here, since PredBB is dereferenced right below.
2017 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
2018 // Set insertion point in PredBB in case an extract needs to be generated.
2019 // TODO: Model extracts explicitly.
2020 State.Builder.SetInsertPoint(PredBB->getTerminator());
2021 Value *V = State.get(ExitValue, VPLane(Lane));
2022 // If there is no existing block for PredBB in the phi, add a new incoming
2023 // value. Otherwise update the existing incoming value for PredBB.
2024 if (Phi->getBasicBlockIndex(PredBB) == -1)
2025 Phi->addIncoming(V, PredBB);
2026 else
2027 Phi->setIncomingValueForBlock(PredBB, V);
2028 }
2029
2030 // Advance the insert point after the wrapped IR instruction. This allows
2031 // interleaving VPIRInstructions and other recipes.
2032 State.Builder.SetInsertPoint(Phi->getParent(), std::next(Phi->getIterator()));
2033 }
2034
2036 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
     // NOTE(review): the signature is missing from this copy (presumably
     // the accessor that removes the incoming value for IncomingBlock --
     // TODO confirm).
     //
     // Removes the operand whose position equals the index of IncomingBlock
     // among the parent block's predecessors. Relies on operands and
     // predecessors being in one-to-one order, as asserted below.
2037 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
2038 "Number of phi operands must match number of predecessors");
2039 unsigned Position = R->getParent()->getIndexForPredecessor(IncomingBlock);
2040 R->removeOperand(Position);
2041 }
2042
2043 VPValue *
     // NOTE(review): the line with the function name and parameter list is
     // missing from this copy (presumably getIncomingValueForBlock -- TODO
     // confirm). Returns the incoming value at the position VPBB occupies
     // among the parent block's predecessors.
2045 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
2046 return getIncomingValue(R->getParent()->getIndexForPredecessor(VPBB));
2047 }
2048
2050 VPValue *V) const {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably setIncomingValueForBlock -- TODO confirm). Replaces
     // the operand at the position VPBB occupies among the parent block's
     // predecessors with \p V. The function is const but mutates the recipe
     // through the const_cast below.
2051 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
2052 R->setOperand(R->getParent()->getIndexForPredecessor(VPBB), V);
2053 }
2054
2055#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2057 VPSlotTracker &SlotTracker) const {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably printPhiOperands -- TODO confirm).
     //
     // Prints the operands as a comma-separated list of
     // "[ <value>, <incoming block> ]" pairs, where the incoming block is
     // looked up by the operand's index.
2058 interleaveComma(enumerate(getAsRecipe()->operands()), O,
2059 [this, &O, &SlotTracker](auto Op) {
2060 O << "[ ";
2061 Op.value()->printAsOperand(O, SlotTracker);
2062 O << ", ";
2063 getIncomingBlock(Op.index())->printAsOperand(O);
2064 O << " ]";
2065 });
2066 }
2067#endif
2068
2069#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2071 VPSlotTracker &SlotTracker) const {
     // NOTE(review): the line opening this signature, the first statement of
     // the body, and the first line of the interleaveComma call are missing
     // from this copy (presumably VPIRPhi::printRecipe -- TODO confirm).
     //
     // When the recipe has operands, appends
     // " (extra operand[s]: <value> from <block>, ...)"; the plural "s" is
     // only emitted for more than one operand.
2073
2074 if (getNumOperands() != 0) {
2075 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
2077 [&O, &SlotTracker](auto Op) {
2078 std::get<0>(Op)->printAsOperand(O, SlotTracker);
2079 O << " from ";
2080 std::get<1>(Op)->printAsOperand(O);
2081 });
2082 O << ")";
2083 }
2084 }
2085#endif
2086
     // NOTE(review): the signature is missing from this copy (presumably
     // the applyMetadata member taking an Instruction &I -- TODO confirm).
     // Attaches every stored (kind, node) pair to \p I.
2088 for (const auto &[Kind, Node] : Metadata)
2089 I.setMetadata(Kind, Node);
2090 }
2091
2093 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
     // NOTE(review): the signature is missing from this copy (presumably an
     // intersect member taking `Other` -- TODO confirm).
     //
     // Keeps only those entries of Metadata for which Other.Metadata holds
     // an entry with the same kind and the same node pointer. The order of
     // the surviving entries follows this object's Metadata. Pairwise
     // comparison is quadratic in the number of entries.
2094 for (const auto &[KindA, MDA] : Metadata) {
2095 for (const auto &[KindB, MDB] : Other.Metadata) {
2096 if (KindA == KindB && MDA == MDB) {
2097 MetadataIntersection.emplace_back(KindA, MDA);
     // One match is enough; stop scanning Other for this entry.
2098 break;
2099 }
2100 }
2101 }
2102 Metadata = std::move(MetadataIntersection);
2103 }
2104
2105#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2107 const Module *M = SlotTracker.getModule();
     // NOTE(review): the signature is missing from this copy (presumably
     // the metadata print member taking O and SlotTracker -- TODO confirm).
     //
     // Prints the stored metadata as " (!<kind name> <node>, ...)". Nothing
     // is printed when there is no metadata, no module is available, or
     // printing was disabled through the vplan-print-metadata option.
2108 if (Metadata.empty() || !M || !VPlanPrintMetadata)
2109 return;
2110
2111 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
2112 O << " (";
2113 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
2114 auto [Kind, Node] = KindNodePair;
     // Every stored kind is expected to have a registered, non-empty name.
2115 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
2116 "Unexpected unnamed metadata kind");
2117 O << "!" << MDNames[Kind] << " ";
2118 Node->printAsOperand(O, M);
2119 });
2120 O << ")";
2121 }
2122#endif
2123
2125 assert(State.VF.isVector() && "not widening");
     // NOTE(review): the signature and the declarations of Args, CI and
     // OpBundles are missing from this copy (presumably
     // VPWidenCallRecipe::execute -- TODO confirm).
     //
     // Emits a call to the vector function Variant, passing each argument
     // either as a scalar (lane 0) or as a widened value depending on the
     // variant's parameter type, and records the call as this recipe's
     // value unless it returns void.
2126 assert(Variant != nullptr && "Can't create vector function.");
2127
2128 FunctionType *VFTy = Variant->getFunctionType();
2129 // Add return type if intrinsic is overloaded on it.
2131 for (const auto &I : enumerate(args())) {
2132 Value *Arg;
2133 // Some vectorized function variants may also take a scalar argument,
2134 // e.g. linear parameters for pointers. This needs to be the scalar value
2135 // from the start of the respective part when interleaving.
2136 if (!VFTy->getParamType(I.index())->isVectorTy())
2137 Arg = State.get(I.value(), VPLane(0));
2138 else
2139 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
2140 Args.push_back(Arg);
2141 }
2142
     // Operand bundles are carried over from CI when it is non-null.
2145 if (CI)
2146 CI->getOperandBundlesAsDefs(OpBundles);
2147
2148 CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
2149 applyFlags(*V);
2150 applyMetadata(*V);
     // The call must use the same calling convention as the callee.
2151 V->setCallingConv(Variant->getCallingConv());
2152
2153 if (!V->getType()->isVoidTy())
2154 State.set(this, V);
2155 }
2156
2158 VPCostContext &Ctx) const {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably VPWidenCallRecipe::computeCost -- TODO confirm).
     // The cost is that of calling Variant; \p VF is only used to check that
     // the variant's return type was vectorized for the same factor.
2159 assert(getVectorizedTypeVF(Variant->getReturnType()) == VF &&
2160 "Variant return type must match VF");
2161 return computeCallCost(Variant, Ctx);
2162 }
2163
2165 VPCostContext &Ctx) {
     // NOTE(review): the line opening this signature is missing from this
     // copy (presumably computeCallCost taking the Variant function -- TODO
     // confirm). Queries TTI for the cost of a call with Variant's return
     // and parameter types; no callee Function is passed (nullptr).
2166 return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
2167 Variant->getFunctionType()->params(),
2168 Ctx.CostKind);
2169 }
2170
2172 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
     // NOTE(review): the signature is missing from this copy (presumably
     // VPWidenCallRecipe::usesFirstLaneOnly -- TODO confirm).
     //
     // Returns true when every argument position at which \p Op is passed
     // has an integer, floating-point, pointer or byte parameter type in
     // Variant's signature. \p Op may appear at several positions; all of
     // them must satisfy that condition.
2173 assert(Variant && "Variant not set");
2174 FunctionType *VFTy = Variant->getFunctionType();
2175 return all_of(enumerate(args()), [VFTy, &Op](const auto &Arg) {
2176 auto [Idx, V] = Arg;
2177 Type *ArgTy = VFTy->getParamType(Idx);
     // Positions holding other values are irrelevant and pass trivially.
2178 return V != Op || ArgTy->isIntegerTy() || ArgTy->isFloatingPointTy() ||
2179 ArgTy->isPointerTy() || ArgTy->isByteTy();
2180 });
2181 }
2182
2183#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2185 VPSlotTracker &SlotTracker) const {
     // NOTE(review): the line opening this signature and the statement that
     // prints the result are missing from this copy (presumably
     // VPWidenCallRecipe::printRecipe -- TODO confirm).
     //
     // Prints "WIDEN-CALL", the result (or "void " for void callees), the
     // scalar callee with its arguments, and finally which library function
     // is used when Variant has a name.
2186 O << Indent << "WIDEN-CALL ";
2187
2188 Function *CalledFn = getCalledScalarFunction();
2189 if (CalledFn->getReturnType()->isVoidTy())
2190 O << "void ";
2191 else {
2193 O << " = ";
2194 }
2195
2196 O << "call";
2197 printFlags(O);
2198 O << " @" << CalledFn->getName() << "(";
2199 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
2200 Op->printAsOperand(O, SlotTracker);
2201 });
2202 O << ")";
2203
2204 O << " (using library function";
2205 if (Variant->hasName())
2206 O << ": " << Variant->getName();
2207 O << ")";
2208 }
2209#endif
2210
2212 assert(State.VF.isVector() && "not widening");
// NOTE(review): the signature line and a few interior lines are missing from
// this extract (presumably including the declarations of Args, OpBundles and
// CI, which are used below but not declared in the visible text).
// Builds the call to the vector form of VectorIntrinsicID and returns it. This
// function does not itself register the result in State.
2213
2214 SmallVector<Type *, 2> TysForDecl;
2215 // Add return type if intrinsic is overloaded on it.
2216 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
2217 State.TTI)) {
2218 Type *RetTy = toVectorizedTy(getScalarType(), State.VF);
2219 ArrayRef<Type *> ContainedTys = getContainedTypes(RetTy);
2220 for (auto [Idx, Ty] : enumerate(ContainedTys)) {
2222 Idx, State.TTI))
2223 TysForDecl.push_back(Ty);
2224 }
2225 }
2227 for (const auto &I : enumerate(operands())) {
2228 // Some intrinsics have a scalar argument - don't replace it with a
2229 // vector.
2230 Value *Arg;
2231 if (isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index(),
2232 State.TTI))
2233 Arg = State.get(I.value(), VPLane(0));
2234 else
2235 Arg = State.get(I.value(), usesFirstLaneOnly(I.value()));
// Operand types the intrinsic is overloaded on also select the declaration.
2236 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index(),
2237 State.TTI))
2238 TysForDecl.push_back(Arg->getType());
2239 Args.push_back(Arg);
2240 }
2241
2242 // Use vector version of the intrinsic.
2243 Module *M = State.Builder.GetInsertBlock()->getModule();
2244 Function *VectorF =
2245 Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
2246 assert(VectorF &&
2247 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
2248
// Carry over operand bundles when CI is non-null.
2251 if (CI)
2252 CI->getOperandBundlesAsDefs(OpBundles);
2253
2254 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
2255
2256 applyFlags(*V);
2257 applyMetadata(*V);
2258
2259 return V;
2260}
2261
2263 CallInst *V = createVectorCall(State);
// NOTE(review): the signature line is missing from this extract.
// A void call produces no value, so only non-void results are recorded for
// this recipe.
2264 if (!V->getType()->isVoidTy())
2265 State.set(this, V);
2266}
2267
2270 const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx) {
// NOTE(review): the leading signature lines (which presumably declare ID and
// Operands), the declaration of Arguments, and the final line of the
// IntrinsicCostAttributes construction are missing from this extract.
// Returns the TTI intrinsic cost for ID at the given VF, built from R's scalar
// result type, the operands' scalar types and, where available, underlying IR
// values.
2271 Type *ScalarRetTy = R.getScalarType();
2272 // Skip the reverse operation cost for the mask.
2273 // FIXME: Remove this once redundant mask reverse operations can be eliminated
2274 // by VPlanTransforms::cse before cost computation.
2275 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
2276 return InstructionCost(0);
2277
2278 // Some backends analyze intrinsic arguments to determine cost. Use the
2279 // underlying value for the operand if it has one. Otherwise try to use the
2280 // operand of the underlying call instruction, if there is one. Otherwise
2281 // clear Arguments.
2282 // TODO: Rework TTI interface to be independent of concrete IR values.
2284 for (const auto &[Idx, Op] : enumerate(Operands)) {
2285 auto *V = Op->getUnderlyingValue();
2286 if (!V) {
2287 if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
2288 Arguments.push_back(UI->getArgOperand(Idx));
2289 continue;
2290 }
// No IR value is available for this operand: drop all collected arguments and
// stop, so the cost query relies on the parameter types alone.
2291 Arguments.clear();
2292 break;
2293 }
2294 Arguments.push_back(V);
2295 }
2296
// The return type is only vectorized for vector VFs.
2297 Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
2298 SmallVector<Type *> ParamTys =
2299 map_to_vector(Operands, [&](const VPValue *Op) {
2300 return toVectorTy(Op->getScalarType(), VF);
2301 });
2302
2303 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
2304 IntrinsicCostAttributes CostAttrs(
2305 ID, RetTy, Arguments, ParamTys, R.getFastMathFlagsOrNone(),
2306 dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
2308 return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
2309}
2310
2312 VPCostContext &Ctx) const {
// NOTE(review): the first signature line and the line defining ArgOps are
// missing from this extract.
// Delegates to computeCallCost with VectorIntrinsicID and this recipe.
2314 return computeCallCost(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
2315}
2316
// NOTE(review): the signature line is missing from this extract.
// Returns the base name that Intrinsic::getBaseName reports for
// VectorIntrinsicID.
2318 return Intrinsic::getBaseName(VectorIntrinsicID);
2319}
2320
2322 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
// NOTE(review): the signature line and the first line of the lambda's return
// expression are missing from this extract, so the exact per-operand predicate
// cannot be read here. The visible code requires that predicate to hold for
// every operand position (all_of).
2323 return all_of(enumerate(operands()), [this, &Op](const auto &X) {
2324 auto [Idx, V] = X;
2326 Idx, nullptr);
2327 });
2328}
2329
2330#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2332 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line, the statement that prints the result
// in the `else` branch, and the line opening the operand-printing call are
// missing from this extract.
// Output shape: "WIDEN-INTRINSIC", then "void " or a " = " result prefix, then
// "call", the flags, the intrinsic name and the parenthesised operands.
2333 O << Indent << "WIDEN-INTRINSIC ";
2334 if (getScalarType()->isVoidTy()) {
2335 O << "void ";
2336 } else {
2338 O << " = ";
2339 }
2340
2341 O << "call";
2342 printFlags(O);
2343 O << getIntrinsicName() << "(";
2344
2346 Op->printAsOperand(O, SlotTracker);
2347 });
2348 O << ")";
2349}
2350#endif
2351
2353 CallInst *MemI = createVectorCall(State);
// NOTE(review): the signature line is missing from this extract.
// Attach the recipe's Alignment as an alignment attribute on parameter index 0
// of the created call, then record the call as this recipe's value.
2354 MemI->addParamAttr(
2355 0, Attribute::getWithAlignment(MemI->getContext(), Alignment));
2356 State.set(this, MemI);
2357}
2358
2360 Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment,
2361 VPCostContext &Ctx) {
// NOTE(review): the first signature line (return type and function name) is
// missing from this extract.
// Queries TTI for the memory-intrinsic cost of IID on type Ty; no pointer
// value is supplied (Ptr is nullptr).
2362 return Ctx.TTI.getMemIntrinsicInstrCost(
2363 MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr, IsMasked, Alignment),
2364 Ctx.CostKind);
2365}
2366
2369 VPCostContext &Ctx) const {
// NOTE(review): the leading signature lines and the first line of the return
// statement are missing from this extract. The visible arguments show that the
// access is treated as masked unless operand 2 matches an all-true constant.
2370 Type *Ty = toVectorTy(getScalarType(), VF);
2372 !match(getOperand(2), m_True()), Alignment,
2373 Ctx);
2374}
2375
2377 IRBuilderBase &Builder = State.Builder;
// NOTE(review): the signature line is missing from this extract.
// Emits a call to the experimental_vector_histogram_add intrinsic with the
// vector of addresses (operand 0), the scalar increment (operand 1) and a mask.
2378
2379 Value *Address = State.get(getOperand(0));
2380 Value *IncAmt = State.get(getOperand(1), /*IsScalar=*/true);
2381 VectorType *VTy = cast<VectorType>(Address->getType());
2382
2383 // The histogram intrinsic requires a mask even if the recipe doesn't;
2384 // if the mask operand was omitted then all lanes should be executed and
2385 // we just need to synthesize an all-true mask.
2386 Value *Mask = nullptr;
2387 if (VPValue *VPMask = getMask())
2388 Mask = State.get(VPMask);
2389 else
2390 Mask =
2391 Builder.CreateVectorSplat(VTy->getElementCount(), Builder.getInt1(1));
2392
2393 // If this is a subtract, we want to invert the increment amount. We may
2394 // add a separate intrinsic in future, but for now we'll try this.
2395 if (Opcode == Instruction::Sub)
2396 IncAmt = Builder.CreateNeg(IncAmt);
2397 else
2398 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2399
// The intrinsic is overloaded on the address-vector type and the increment
// type. Its result is not recorded in State; only metadata is applied.
2400 Instruction *HistogramInst = State.Builder.CreateIntrinsicWithoutFolding(
2401 Intrinsic::experimental_vector_histogram_add, {VTy, IncAmt->getType()},
2402 {Address, IncAmt, Mask});
2403 applyMetadata(*HistogramInst);
2404}
2405
2407 VPCostContext &Ctx) const {
// NOTE(review): the first signature line is missing from this extract.
// Total cost = histogram intrinsic + (optional) multiply + one arithmetic
// operation with this recipe's Opcode on a vector of the increment type.
2408 // FIXME: Take the gather and scatter into account as well. For now we're
2409 // generating the same cost as the fallback path, but we'll likely
2410 // need to create a new TTI method for determining the cost, including
2411 // whether we can use base + vec-of-smaller-indices or just
2412 // vec-of-pointers.
2413 assert(VF.isVector() && "Invalid VF for histogram cost");
2414 Type *AddressTy = getOperand(0)->getScalarType();
2415 VPValue *IncAmt = getOperand(1);
2416 Type *IncTy = IncAmt->getScalarType();
2417 VectorType *VTy = VectorType::get(IncTy, VF);
2418
2419 // Assume that a non-constant update value (or a constant != 1) requires
2420 // a multiply, and add that into the cost.
2421 InstructionCost MulCost =
2422 Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
2423 if (match(IncAmt, m_One()))
2424 MulCost = TTI::TCC_Free;
2425
2426 // Find the cost of the histogram operation itself.
2427 Type *PtrTy = VectorType::get(AddressTy, VF);
2428 Type *MaskTy = VectorType::get(Type::getInt1Ty(Ctx.LLVMCtx), VF);
2429 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2430 Type::getVoidTy(Ctx.LLVMCtx),
2431 {PtrTy, IncTy, MaskTy});
2432
2433 // Add the costs together with the add/sub operation.
2434 return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
2435 Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
2436}
2437
2438#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2440 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the two statements that print the
// bucket and increment operands are missing from this extract.
// Output shape: "WIDEN-HISTOGRAM buckets: ...", then ", dec: ..." for Sub or
// ", inc: ..." for Add, then ", mask: <mask>" when a mask is present.
2441 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2443
2444 if (Opcode == Instruction::Sub)
2445 O << ", dec: ";
2446 else {
2447 assert(Opcode == Instruction::Add);
2448 O << ", inc: ";
2449 }
2451
2452 if (VPValue *Mask = getMask()) {
2453 O << ", mask: ";
2454 Mask->printAsOperand(O, SlotTracker);
2455 }
2456}
2457#endif
2458
2459VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2460 AllowReassoc = FMF.allowReassoc();
2461 NoNaNs = FMF.noNaNs();
2462 NoInfs = FMF.noInfs();
2463 NoSignedZeros = FMF.noSignedZeros();
2464 AllowReciprocal = FMF.allowReciprocal();
2465 AllowContract = FMF.allowContract();
2466 ApproxFunc = FMF.approxFunc();
2467}
2468
2470 switch (Opcode) {
// NOTE(review): the signature line and several case labels are missing from
// this extract.
// Maps an opcode to a default-constructed flags object of the matching kind
// with every flag cleared. ICmp/FCmp (and any case on the missing line before
// the llvm_unreachable) have no default and must be given explicit flags.
2471 case Instruction::Add:
2472 case Instruction::Sub:
2473 case Instruction::Mul:
2474 case Instruction::Shl:
2476 return WrapFlagsTy(false, false);
2477 case Instruction::Trunc:
2478 return TruncFlagsTy(false, false);
2479 case Instruction::Or:
2480 return DisjointFlagsTy(false);
2481 case Instruction::AShr:
2482 case Instruction::LShr:
2483 case Instruction::UDiv:
2484 case Instruction::SDiv:
2485 return ExactFlagsTy(false);
2486 case Instruction::GetElementPtr:
2489 return GEPNoWrapFlags::none();
2490 case Instruction::ZExt:
2491 case Instruction::UIToFP:
2492 return NonNegFlagsTy(false);
2493 case Instruction::FAdd:
2494 case Instruction::FSub:
2495 case Instruction::FMul:
2496 case Instruction::FDiv:
2497 case Instruction::FRem:
2498 case Instruction::FNeg:
2499 case Instruction::FPExt:
2500 case Instruction::FPTrunc:
2501 return FastMathFlags();
2502 case Instruction::ICmp:
2503 case Instruction::FCmp:
2505 llvm_unreachable("opcode requires explicit flags");
2506 default:
2507 return VPIRFlags();
2508 }
2509}
2510
2511#if !defined(NDEBUG)
// Returns true if the flag kind currently stored in OpType may legally be
// attached to an operation with the given Opcode. OperationType::Other accepts
// every opcode.
// NOTE(review): two lines are missing from this extract: the end of the
// FPMathOp condition and the statement under the ReductionOp case.
2512bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2513 switch (OpType) {
2514 case OperationType::OverflowingBinOp:
2515 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2516 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2517 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2518 case OperationType::Trunc:
2519 return Opcode == Instruction::Trunc;
2520 case OperationType::DisjointOp:
2521 return Opcode == Instruction::Or;
2522 case OperationType::PossiblyExactOp:
2523 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2524 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2525 case OperationType::GEPOp:
2526 return Opcode == Instruction::GetElementPtr ||
2527 Opcode == VPInstruction::PtrAdd ||
2528 Opcode == VPInstruction::WidePtrAdd;
2529 case OperationType::FPMathOp:
2530 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2531 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2532 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2533 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2534 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2535 Opcode == Instruction::Select || Opcode == Instruction::SIToFP ||
2536 Opcode == Instruction::UIToFP ||
2537 Opcode == VPInstruction::WideIVStep ||
2539 case OperationType::FCmp:
2540 return Opcode == Instruction::FCmp;
2541 case OperationType::NonNegOp:
2542 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2543 case OperationType::Cmp:
2544 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2545 case OperationType::ReductionOp:
2547 case OperationType::Other:
2548 return true;
2549 }
2550 llvm_unreachable("Unknown OperationType enum");
2551}
2552
// Returns true if this object carries the flag kind that Opcode requires.
// Opcodes without default flags are checked explicitly first; for all others
// the kind of getDefaultFlags(Opcode) must either be Other (no requirement) or
// equal OpType.
// NOTE(review): the condition guarding the ReductionOp return is missing from
// this extract.
2553bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2554 // Handle opcodes without default flags.
2555 if (Opcode == Instruction::ICmp)
2556 return OpType == OperationType::Cmp;
2557 if (Opcode == Instruction::FCmp)
2558 return OpType == OperationType::FCmp;
2560 return OpType == OperationType::ReductionOp;
2561
2562 OperationType Required = getDefaultFlags(Opcode).OpType;
2563 return Required == OperationType::Other || Required == OpType;
2564}
2565#endif
2566
2567#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Writes a short lower-case textual name for the recurrence kind Kind to OS.
// The switch has no default case.
// NOTE(review): several `case` labels are missing from this extract; each
// affected output statement below appears without its label.
2568static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind) {
2569 switch (Kind) {
2570 case RecurKind::None:
2571 OS << "none";
2572 break;
2573 case RecurKind::Add:
2574 OS << "add";
2575 break;
2576 case RecurKind::Sub:
2577 OS << "sub";
2578 break;
2580 OS << "add-chain-with-subs";
2581 break;
2582 case RecurKind::Mul:
2583 OS << "mul";
2584 break;
2585 case RecurKind::Or:
2586 OS << "or";
2587 break;
2588 case RecurKind::And:
2589 OS << "and";
2590 break;
2591 case RecurKind::Xor:
2592 OS << "xor";
2593 break;
2594 case RecurKind::SMin:
2595 OS << "smin";
2596 break;
2597 case RecurKind::SMax:
2598 OS << "smax";
2599 break;
2600 case RecurKind::UMin:
2601 OS << "umin";
2602 break;
2603 case RecurKind::UMax:
2604 OS << "umax";
2605 break;
2606 case RecurKind::FAdd:
2607 OS << "fadd";
2608 break;
2610 OS << "fadd-chain-with-subs";
2611 break;
2612 case RecurKind::FSub:
2613 OS << "fsub";
2614 break;
2615 case RecurKind::FMul:
2616 OS << "fmul";
2617 break;
2618 case RecurKind::FMin:
2619 OS << "fmin";
2620 break;
2621 case RecurKind::FMax:
2622 OS << "fmax";
2623 break;
2624 case RecurKind::FMinNum:
2625 OS << "fminnum";
2626 break;
2627 case RecurKind::FMaxNum:
2628 OS << "fmaxnum";
2629 break;
2631 OS << "fminimum";
2632 break;
2634 OS << "fmaximum";
2635 break;
2637 OS << "fminimumnum";
2638 break;
2640 OS << "fmaximumnum";
2641 break;
2642 case RecurKind::FMulAdd:
2643 OS << "fmuladd";
2644 break;
2645 case RecurKind::AnyOf:
2646 OS << "any-of";
2647 break;
2648 case RecurKind::FindIV:
2649 OS << "find-iv";
2650 break;
2652 OS << "find-last";
2653 break;
2654 }
2655}
2656
2658 switch (OpType) {
// NOTE(review): the signature line and the statements for the Cmp, FCmp and
// FPMathOp cases, the declaration of Flags in the GEPOp case, and parts of the
// ReductionOp case are missing from this extract.
// Prints the flags that are set for the active OpType, each preceded by a
// single space, and always finishes with one trailing space.
2659 case OperationType::Cmp:
2661 break;
2662 case OperationType::FCmp:
2665 break;
2666 case OperationType::DisjointOp:
2667 if (DisjointFlags.IsDisjoint)
2668 O << " disjoint";
2669 break;
2670 case OperationType::PossiblyExactOp:
2671 if (ExactFlags.IsExact)
2672 O << " exact";
2673 break;
2674 case OperationType::OverflowingBinOp:
2675 if (WrapFlags.HasNUW)
2676 O << " nuw";
2677 if (WrapFlags.HasNSW)
2678 O << " nsw";
2679 break;
2680 case OperationType::Trunc:
2681 if (TruncFlags.HasNUW)
2682 O << " nuw";
2683 if (TruncFlags.HasNSW)
2684 O << " nsw";
2685 break;
2686 case OperationType::FPMathOp:
2688 break;
2689 case OperationType::GEPOp: {
// "inbounds" takes precedence over "nusw"; "nuw" is printed independently.
2691 if (Flags.isInBounds())
2692 O << " inbounds";
2693 else if (Flags.hasNoUnsignedSignedWrap())
2694 O << " nusw";
2695 if (Flags.hasNoUnsignedWrap())
2696 O << " nuw";
2697 break;
2698 }
2699 case OperationType::NonNegOp:
2700 if (NonNegFlags.NonNeg)
2701 O << " nneg";
2702 break;
2703 case OperationType::ReductionOp: {
2704 O << " (";
2706 if (isReductionInLoop())
2707 O << ", in-loop";
2708 if (isReductionOrdered())
2709 O << ", ordered";
2710 O << ")";
2712 break;
2713 }
2714 case OperationType::Other:
2715 break;
2716 }
2717 O << " ";
2718}
2719#endif
2720
2722 auto &Builder = State.Builder;
// NOTE(review): the signature line, the declaration of Ops in the unop/binop
// case and one line inside the Select case are missing from this extract.
// Generates the widened IR for this recipe's Opcode and records the result in
// State. Opcodes that have dedicated recipes, and opcodes that are not handled
// here, hit llvm_unreachable.
2723 switch (Opcode) {
2724 case Instruction::Call:
2725 case Instruction::UncondBr:
2726 case Instruction::CondBr:
2727 case Instruction::PHI:
2728 case Instruction::GetElementPtr:
2729 llvm_unreachable("This instruction is handled by a different recipe.");
2730 case Instruction::UDiv:
2731 case Instruction::SDiv:
2732 case Instruction::SRem:
2733 case Instruction::URem:
2734 case Instruction::Add:
2735 case Instruction::FAdd:
2736 case Instruction::Sub:
2737 case Instruction::FSub:
2738 case Instruction::FNeg:
2739 case Instruction::Mul:
2740 case Instruction::FMul:
2741 case Instruction::FDiv:
2742 case Instruction::FRem:
2743 case Instruction::Shl:
2744 case Instruction::LShr:
2745 case Instruction::AShr:
2746 case Instruction::And:
2747 case Instruction::Or:
2748 case Instruction::Xor: {
2749 // Just widen unops and binops.
2751 for (VPValue *VPOp : operands())
2752 Ops.push_back(State.get(VPOp));
2753
2754 Value *V = Builder.CreateNAryOp(Opcode, Ops);
2755
// The builder may return a non-instruction value; flags and metadata are only
// applied when the result is an Instruction.
2756 if (auto *VecOp = dyn_cast<Instruction>(V)) {
2757 applyFlags(*VecOp);
2758 applyMetadata(*VecOp);
2759 }
2760
2761 // Use this vector value for all users of the original instruction.
2762 State.set(this, V);
2763 break;
2764 }
2765 case Instruction::ExtractValue: {
2766 assert(getNumOperands() == 2 && "expected single level extractvalue");
2767 Value *Op = State.get(getOperand(0));
// Operand 1 must be a constant integer; it supplies the extract index.
2768 Value *Extract = Builder.CreateExtractValue(
2769 Op, cast<VPConstantInt>(getOperand(1))->getZExtValue());
2770 State.set(this, Extract);
2771 break;
2772 }
2773 case Instruction::Freeze: {
2774 Value *Op = State.get(getOperand(0));
2775 Value *Freeze = Builder.CreateFreeze(Op);
2776 State.set(this, Freeze);
2777 break;
2778 }
2779 case Instruction::ICmp:
2780 case Instruction::FCmp: {
2781 // Widen compares. Generate vector compares.
2782 bool FCmp = Opcode == Instruction::FCmp;
2783 Value *A = State.get(getOperand(0));
2784 Value *B = State.get(getOperand(1));
2785 Value *C = nullptr;
2786 if (FCmp) {
2787 C = Builder.CreateFCmp(getPredicate(), A, B);
2788 } else {
2789 C = Builder.CreateICmp(getPredicate(), A, B);
2790 }
2791 if (auto *I = dyn_cast<Instruction>(C)) {
2792 applyFlags(*I);
2793 applyMetadata(*I);
2794 }
2795 State.set(this, C);
2796 break;
2797 }
2798 case Instruction::Select: {
2799 VPValue *CondOp = getOperand(0);
// The condition is requested in scalar form when it is a single scalar.
2800 Value *Cond = State.get(CondOp, vputils::isSingleScalar(CondOp));
2801 Value *Op0 = State.get(getOperand(1));
2802 Value *Op1 = State.get(getOperand(2));
2803 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
2804 State.set(this, Sel);
2805 if (auto *I = dyn_cast<Instruction>(Sel)) {
2807 applyFlags(*I);
2808 applyMetadata(*I);
2809 }
2810 break;
2811 }
2812 default:
2813 // This instruction is not vectorized by simple widening.
2814 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2815 << Instruction::getOpcodeName(Opcode));
2816 llvm_unreachable("Unhandled instruction!");
2817 } // end of switch.
2818
2819#if !defined(NDEBUG)
2820 // Verify that VPlan type inference results agree with the type of the
2821 // generated values.
2822 assert(VectorType::get(this->getScalarType(), State.VF) ==
2823 State.get(this)->getType() &&
2824 "inferred type and type from generated instructions do not match");
2825#endif
2826}
2827
2829 VPCostContext &Ctx) const {
// NOTE(review): the first signature line is missing from this extract.
// Every supported opcode falls through to the same return, which delegates to
// getCostForRecipeWithOpcode; any other opcode hits llvm_unreachable.
2830 switch (Opcode) {
2831 case Instruction::UDiv:
2832 case Instruction::SDiv:
2833 case Instruction::SRem:
2834 case Instruction::URem:
2835 // If the div/rem operation isn't safe to speculate and requires
2836 // predication, then the only way we can even create a vplan is to insert
2837 // a select on the second input operand to ensure we use the value of 1
2838 // for the inactive lanes. The select will be costed separately.
2839 case Instruction::FNeg:
2840 case Instruction::Add:
2841 case Instruction::FAdd:
2842 case Instruction::Sub:
2843 case Instruction::FSub:
2844 case Instruction::Mul:
2845 case Instruction::FMul:
2846 case Instruction::FDiv:
2847 case Instruction::FRem:
2848 case Instruction::Shl:
2849 case Instruction::LShr:
2850 case Instruction::AShr:
2851 case Instruction::And:
2852 case Instruction::Or:
2853 case Instruction::Xor:
2854 case Instruction::Freeze:
2855 case Instruction::ExtractValue:
2856 case Instruction::ICmp:
2857 case Instruction::FCmp:
2858 case Instruction::Select:
2859 return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
2860 default:
2861 llvm_unreachable("Unsupported opcode for instruction");
2862 }
2863}
2864
2865#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2867 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract.
// Visible output: "WIDEN ", then " = <opcode name>" followed by the flags.
2868 O << Indent << "WIDEN ";
2870 O << " = " << Instruction::getOpcodeName(Opcode);
2871 printFlags(O);
2873}
2874#endif
2875
2877 auto &Builder = State.Builder;
// NOTE(review): the signature line is missing from this extract.
// Casts the widened operand 0 to a vector of this recipe's scalar type with
// State.VF elements, using Opcode as the cast operation.
2878 /// Vectorize casts.
2879 assert(State.VF.isVector() && "Not vectorizing?");
2880 Type *DestTy = VectorType::get(getScalarType(), State.VF);
2881 VPValue *Op = getOperand(0);
2882 Value *A = State.get(Op);
2883 Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
2884 State.set(this, Cast);
// Flags and metadata are only applied when the cast result is an Instruction.
2885 if (auto *CastOp = dyn_cast<Instruction>(Cast)) {
2886 applyFlags(*CastOp);
2887 applyMetadata(*CastOp);
2888 }
2889}
2890
2895
2896#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2898 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract.
// Visible output: "WIDEN-CAST ", " = <opcode name>", the flags, and finally
// " to <scalar type>".
2899 O << Indent << "WIDEN-CAST ";
2901 O << " = " << Instruction::getOpcodeName(Opcode);
2902 printFlags(O);
2904 O << " to " << *getScalarType();
2905}
2906#endif
2907
2909 VPCostContext &Ctx) const {
// NOTE(review): the first signature line is missing from this extract.
// Costed as a single PHI via TTI's control-flow instruction cost; VF is not
// used in the visible body.
2910 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
2911}
2912
2913#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2915 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract.
// Visible output: " = WIDEN-INDUCTION", the flags, and, when a truncate
// instruction is attached, " (truncated to <type>)".
2916 O << Indent;
2918 O << " = WIDEN-INDUCTION";
2919 printFlags(O);
2921
2922 if (auto *TI = getTruncInst())
2923 O << " (truncated to " << *TI->getType() << ")";
2924}
2925#endif
2926
2928 // The step may be defined by a recipe in the preheader (e.g. if it requires
2929 // SCEV expansion), but for the canonical induction the step is required to be
2930 // 1, which is represented as live-in.
// NOTE(review): the signature line is missing from this extract.
// True only when the start value is a zero integer, the step is one, and the
// scalar type equals the enclosing region's canonical IV type.
2931 return match(getStartValue(), m_ZeroInt()) &&
2932 match(getStepValue(), m_One()) &&
2933 getScalarType() == getRegion()->getCanonicalIVType();
2934}
2935
2937 VPCostContext &Ctx) const {
// NOTE(review): the first signature line, the `case` label that opens the
// costed induction kind, and the declaration of Cost are missing from this
// extract.
// Returns 0 unless the handled induction kind is hit and only the first lane
// is used; otherwise sums the cost of the cast, mul/shl and add that are
// expected to survive simplification.
2938 // The cost model for this is modelled on expandVPDerivedIV in
2939 // VPlanTransforms.cpp. In order to avoid overly pessimistic costs that can
2940 // negatively affect vectorization it takes into account any expected
2941 // simplifications that happen in simplifyRecipe.
2942 switch (getInductionKind()) {
2943 default:
2944 // TODO: Compute cost for remaining kinds.
2945 break;
2947 // There are currently no tests that expose a path where all lanes are
2948 // used, so it's better to bail out for now.
2949 if (!vputils::onlyFirstLaneUsed(this))
2950 break;
2951
2952 // Start off by assuming we need both mul and add, then refine this.
2953 bool NeedsMul = true, NeedsAdd = true, NeedsShl = false;
2954
2955 // If the start value is zero the add gets folded away.
2956 if (auto *VPV = dyn_cast<VPIRValue>(getStartValue()))
2957 if (auto *StartC = dyn_cast<ConstantInt>(VPV->getValue()))
2958 NeedsAdd = !StartC->isZero();
2959
2960 // For some values of step the arithmetic changes:
2961 // 1. A step of 1 requires no operation.
2962 // 2. A step of -1 requires a negate.
2963 // 3. A power-of-2 step will use a shl, instead of a mul.
2964 Type *StepTy = getStepValue()->getScalarType();
2966 if (auto *VPV = dyn_cast<VPIRValue>(getStepValue())) {
2967 if (auto *StepC = dyn_cast<ConstantInt>(VPV->getValue())) {
2968 if (StepC->isOne())
2969 NeedsMul = false;
2970 else if (StepC->isMinusOne()) {
2971 // This will most likely end up as a negate in simplifyRecipe, and
2972 // the negate will be combined with the add to make a sub.
2973 // NOTE: This is perhaps an invalid assumption that the cost of an
2974 // 'add' is the same as a 'sub'.
2975 NeedsMul = false;
2976 NeedsAdd = true;
2977 } else if (StepC->getValue().isPowerOf2()) {
2978 // This will most likely end up as a shift-left in simplifyRecipe
2979 NeedsMul = false;
2980 NeedsShl = true;
2981 }
2982 }
2983 }
2984
2985 // Add the cost of the conversion from index to step type if the index
2986 // will be used.
2987 Type *IndexTy = getIndex()->getScalarType();
2988 unsigned StepTySize = StepTy->getScalarSizeInBits();
2989 unsigned IndexTySize = IndexTy->getScalarSizeInBits();
2990 if ((NeedsAdd || NeedsMul || NeedsShl) && StepTySize != IndexTySize) {
// A narrower step type is costed as a truncate, a wider one as a sign-extend.
2991 unsigned CastOpc =
2992 StepTySize < IndexTySize ? Instruction::Trunc : Instruction::SExt;
2993 Cost += Ctx.TTI.getCastInstrCost(
2994 CastOpc, StepTy, IndexTy, TTI::CastContextHint::None, Ctx.CostKind);
2995 }
2996
2997 if (NeedsMul)
2998 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, StepTy,
2999 Ctx.CostKind);
// The shift amount is described to TTI as a uniform constant.
3000 if (NeedsShl)
3001 Cost += Ctx.TTI.getArithmeticInstrCost(
3002 Instruction::Shl, StepTy, Ctx.CostKind,
3003 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3004 {TargetTransformInfo::OK_UniformConstantValue,
3005 TargetTransformInfo::OP_None});
3006 if (NeedsAdd)
3007 Cost += Ctx.TTI.getArithmeticInstrCost(Instruction::Add, StepTy,
3008 Ctx.CostKind);
3009 return Cost;
3010 }
3011 }
3012
3013 return 0;
3014}
3015
3016#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3018 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statement that prints the
// result are missing from this extract.
// Output shape: "<result> = DERIVED-IV <start> + <operand 1> * <step>".
3019 O << Indent;
3021 O << " = DERIVED-IV ";
3022 getStartValue()->printAsOperand(O, SlotTracker);
3023 O << " + ";
3024 getOperand(1)->printAsOperand(O, SlotTracker);
3025 O << " * ";
3026 getStepValue()->printAsOperand(O, SlotTracker);
3027}
3028#endif
3029
3031 VPCostContext &Ctx) const {
// NOTE(review): the first signature line and the condition guarding the third
// early return are missing from this extract.
// Returns 0 for non-integer base IVs, for recipes inside a replicator region
// and for the case described by the comment before the third early return;
// otherwise the cost of a single integer add on the base IV type.
3032 // TODO: Add costs for floating point.
3033 Type *BaseIVTy = getOperand(0)->getScalarType();
3034 if (!BaseIVTy->isIntegerTy())
3035 return 0;
3036
3037 // TODO: Add support for predicated regions. Requires scaling the cost by the
3038 // probability of entering the block.
3039 if (getRegion() && getRegion()->isReplicator())
3040 return 0;
3041
3042 // If only the first lane is used, then there won't be any code that remains
3043 // in the loop for the first unrolled part.
3045 return 0;
3046
3047 // Typically the operations are:
3048 // 1. Add the start index to each lane value.
3049 // 2. Multiply the start index by the step.
3050 // 3. Add the scaled start index to base IV.
3051 // Any code generated for 1 and 2 should be loop invariant and therefore
3052 // hoisted out of the loop. We only need to add on the cost of 3.
3053
3054 // Given the users of VPScalarIVStepsRecipe tend to be scalarized GEPs, i.e.
3055 // %add1 = add i32 %iv, 0
3056 // %add2 = add i32 %iv, 1
3057 // %gep1 = getelementptr i8, ptr %p, i32 %add1
3058 // %gep2 = getelementptr i8, ptr %p, i32 %add2
3059 // it's very likely that these GEPs will all be rewritten to have a common
3060 // base such that what's left is just
3061 // %base_gep = getelementptr i8, ptr %p, i32 %iv
3062 // %gep1 = getelementptr i8, ptr %base_gep, i32 0
3063 // %gep2 = getelementptr i8, ptr %base_gep, i32 1
3064 // Therefore, in reality the cost is somewhere between 1*AddCost and
3065 // (NumLanes - 1) * AddCost. For now, assume the cost of a single add.
3066 return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, BaseIVTy,
3067 Ctx.CostKind);
3068}
3069
3071 // Fast-math-flags propagate from the original induction instruction.
// NOTE(review): the signature line and the declarations of AddOp and MulOp are
// missing from this extract.
// For each generated lane, computes BaseIV + (StartIdx0 + Lane) * Step and
// records it as that lane's value of this recipe.
3072 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
3073 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
3074
3075 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
3076 /// variable on which to base the steps, \p Step is the size of the step.
3077
3078 Value *BaseIV = State.get(getOperand(0), VPLane(0));
3079 Value *Step = State.get(getStepValue(), VPLane(0));
3080 IRBuilderBase &Builder = State.Builder;
3081
3082 // Ensure step has the same type as that of scalar IV.
3083 Type *BaseIVTy = BaseIV->getType()->getScalarType();
3084 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
3085
3086 // We build scalar steps for both integer and floating-point induction
3087 // variables. Here, we determine the kind of arithmetic we will perform.
3090 if (BaseIVTy->isIntegerTy()) {
3091 AddOp = Instruction::Add;
3092 MulOp = Instruction::Mul;
3093 } else {
// Non-integer inductions use the recipe's own InductionOpcode for the add
// step and FMul for the scaling.
3094 AddOp = InductionOpcode;
3095 MulOp = Instruction::FMul;
3096 }
3097
3098 // Determine the number of scalars we need to generate.
3099 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
3100 // Compute the scalar steps and save the results in State.
3101
3102 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
// Without an explicit start index the lanes are numbered from zero.
3103 Value *StartIdx0 = getStartIndex() ? State.get(getStartIndex(), true)
3104 : Constant::getNullValue(BaseIVTy);
3105
3106 for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
3107 // It is okay if the induction variable type cannot hold the lane number,
3108 // we expect truncation in this case.
3109 Constant *LaneValue =
3110 BaseIVTy->isIntegerTy()
3111 ? ConstantInt::get(BaseIVTy, Lane, /*IsSigned=*/false,
3112 /*ImplicitTrunc=*/true)
3113 : ConstantFP::get(BaseIVTy, Lane);
3114 Value *StartIdx = Builder.CreateBinOp(AddOp, StartIdx0, LaneValue);
3115 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
3116 "Expected StartIdx to be folded to a constant when VF is not "
3117 "scalable");
3118 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
3119 auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
3120 State.set(this, Add, VPLane(Lane));
3121 }
3122}
3123
3124#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3126 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract; only the
// " = SCALAR-STEPS " separator is visible.
3127 O << Indent;
3129 O << " = SCALAR-STEPS ";
3131}
3132#endif
3133
// NOTE(review): the signature line and the return statement of this function
// are missing from this extract; only the operand-membership precondition is
// visible.
3135 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
3137}
3138
3140 assert(State.VF.isVector() && "not widening");
// NOTE(review): the signature line is missing from this extract.
// Emits one GEP named "wide.gep": the first operand is the base pointer and
// the remaining operands are the indices. Single-scalar operands are fetched
// in scalar form, and the result is stored as scalar when this recipe is
// itself a single scalar.
3141 auto Ops = map_to_vector(operands(), [&](VPValue *Op) {
3142 return State.get(Op, vputils::isSingleScalar(Op));
3143 });
3144 auto *GEP =
3145 State.Builder.CreateGEP(getSourceElementType(), Ops.front(),
3146 drop_begin(Ops), "wide.gep", getGEPNoWrapFlags());
3147 State.set(this, GEP, vputils::isSingleScalar(this));
3148}
3149
3150#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3152 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract.
// Visible output: "WIDEN-GEP ", " = getelementptr" and the flags.
3153 O << Indent << "WIDEN-GEP ";
3155 O << " = getelementptr";
3156 printFlags(O);
3158}
3159#endif
3160
3162 assert(!getOffset() && "Unexpected offset operand");
// NOTE(review): the signature line (which presumably declares Part), the tail
// of the createScalarZExtOrTrunc call and the final statement that consumes
// Offset are missing from this extract.
// Builds VPlan instructions, inserted at this recipe, that compute
// Stride * (VF - 1) + (Part * Stride) * VF in the pointer's index type.
3163 VPBuilder Builder(this);
3164 VPlan &Plan = *getParent()->getPlan();
3165 VPValue *VFVal = getVFValue();
3166 const DataLayout &DL = Plan.getDataLayout();
3167 Type *IndexTy = DL.getIndexType(this->getScalarType());
3168 VPValue *Stride =
3169 Plan.getConstantInt(IndexTy, getStride(), /*IsSigned=*/true);
3170 Type *VFTy = VFVal->getScalarType();
3171 VPValue *VF = Builder.createScalarZExtOrTrunc(VFVal, IndexTy, VFTy,
3173
3174 // Offset for Part0 = Offset0 = Stride * (VF - 1).
3175 VPInstruction *VFMinusOne =
3176 Builder.createSub(VF, Plan.getConstantInt(IndexTy, 1u),
3177 DebugLoc::getUnknown(), "", {true, true});
3178 VPInstruction *Offset0 =
3179 Builder.createOverflowingOp(Instruction::Mul, {VFMinusOne, Stride});
3180
3181 // Offset for PartN = Offset0 + Part * Stride * VF.
3182 VPValue *PartxStride =
3183 Plan.getConstantInt(IndexTy, Part * getStride(), /*IsSigned=*/true);
3184 VPValue *Offset = Builder.createAdd(
3185 Offset0,
3186 Builder.createOverflowingOp(Instruction::Mul, {PartxStride, VF}));
3188}
3189
3191 auto &Builder = State.Builder;
// NOTE(review): the signature line and the last line of the CreateGEP call are
// missing from this extract.
// Requires the offset operand to have been materialized beforehand. Emits a
// scalar GEP of the pointer by that offset and stores it as a scalar result.
3192 assert(getOffset() && "Expected prior materialization of offset");
3193 Value *Ptr = State.get(getPointer(), true);
3194 Value *Offset = State.get(getOffset(), true);
3195 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3197 State.set(this, ResultPtr, /*IsScalar*/ true);
3198}
3199
3200#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3202 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract.
// Visible output: " = vector-end-pointer" followed by the flags.
3203 O << Indent;
3205 O << " = vector-end-pointer";
3206 printFlags(O);
3208}
3209#endif
3210
3212 assert(getVFxPart() &&
3213 "Expected prior simplification of recipe without VFxPart");
// NOTE(review): the signature line and the last line of the CreateGEP call are
// missing from this extract.
// Emits a scalar GEP of operand 0 by the VFxPart offset, scaled by the stride
// unless the stride is the constant one.
3214
3215 auto &Builder = State.Builder;
3216 Value *Ptr = State.get(getOperand(0), VPLane(0));
3217 Value *Offset = State.get(getVFxPart(), true);
3218 // TODO: Expand to VPInstruction to support constant folding.
3219 if (!match(getStride(), m_One())) {
// The stride is zero-extended or truncated to the offset's type before the
// multiply.
3220 Value *Stride = Builder.CreateZExtOrTrunc(State.get(getStride(), true),
3221 Offset->getType());
3222 Offset = Builder.CreateMul(Offset, Stride);
3223 }
3224 Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Offset, "",
3226 State.set(this, ResultPtr, /*IsScalar*/ true);
3227}
3228
3229#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3231 VPSlotTracker &SlotTracker) const {
// NOTE(review): the first signature line and the statements that print the
// result and the operands are missing from this extract.
// Visible output: " = vector-pointer" followed by the flags.
3232 O << Indent;
3234 O << " = vector-pointer";
3235 printFlags(O);
3237}
3238#endif
3239
3241 VPCostContext &Ctx) const {
// NOTE(review): the first signature line and the condition guarding the VF
// override below are missing from this extract.
// Cost is (number of incoming values - 1) selects on the result type at the
// (possibly overridden) VF.
3242 // A blend will be expanded to a select VPInstruction, which will generate a
3243 // scalar select if only the first lane is used.
3245 VF = ElementCount::getFixed(1);
3246
3247 Type *ResultTy = toVectorTy(this->getScalarType(), VF);
3248 Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
3249 return (getNumIncomingValues() - 1) *
3250 Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
3251 CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
3252}
3253
3254#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): listing lines 3255 (start of the signature) and 3258 are
// missing from this extract.
//
// Debug printer for the blend recipe. After "BLEND ... =" and the flags it
// prints either the single incoming value, or every incoming value followed by
// "/" and its mask, separated by spaces.
3256 VPSlotTracker &SlotTracker) const {
3257 O << Indent << "BLEND ";
3259 O << " =";
3260 printFlags(O);
3261 if (getNumIncomingValues() == 1) {
3262 // Not a User of any mask: not really blending, this is a
3263 // single-predecessor phi.
3264 getIncomingValue(0)->printAsOperand(O, SlotTracker);
3265 } else {
3266 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
3267 if (I != 0)
3268 O << " ";
3269 getIncomingValue(I)->printAsOperand(O, SlotTracker);
// For a normalized blend no mask is printed for the first incoming value.
3270 if (I == 0 && isNormalized())
3271 continue;
3272 O << "/";
3273 getMask(I)->printAsOperand(O, SlotTracker);
3274 }
3275 }
3276}
3277#endif
3278
// NOTE(review): listing lines 3279-3281 (the signature and the start of the
// assert whose message is on the first line below), 3293 (initializer of
// Start), 3309 and 3334 (first CreateBinOp argument) and 3330 (condition that
// selects createMinMaxOp) are missing from this extract. `Kind` is defined on
// one of the missing lines.
//
// Generates IR for a reduction recipe. Visible structure:
//  * a conditional reduction first selects between the vector operand and
//    Start (splatted for vector VFs) under the condition;
//  * ordered:  createOrderedReduction for vector VFs, a plain binary op
//    otherwise, both seeded with the scalar chain value;
//  * partial:  llvm.vector.partial.reduce.add / .fadd on the vector chain
//    value;
//  * otherwise (asserted in-loop): createSimpleReduction, then combined with
//    the scalar chain value through createMinMaxOp or a binary op.
// The result is stored as a scalar unless this is a partial reduction.
3282 "In-loop AnyOf reductions aren't currently supported");
3283 // Propagate the fast-math flags carried by the underlying instruction.
3284 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
3285 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
3286 Value *NewVecOp = State.get(getVecOp());
3287 if (VPValue *Cond = getCondOp()) {
3288 Value *NewCond = State.get(Cond, State.VF.isScalar());
3289 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
3290 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
3291
3292 Value *Start =
3294 if (State.VF.isVector())
3295 Start = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Start);
3296
3297 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Start);
3298 NewVecOp = Select;
3299 }
3300 Value *NewRed;
3301 Value *NextInChain;
3302 if (isOrdered()) {
3303 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3304 if (State.VF.isVector())
3305 NewRed =
3306 createOrderedReduction(State.Builder, Kind, NewVecOp, PrevInChain);
3307 else
3308 NewRed = State.Builder.CreateBinOp(
3310 PrevInChain, NewVecOp);
// NOTE(review): PrevInChain is local to this branch and is not read again in
// the visible code, so this store looks dead.
3311 PrevInChain = NewRed;
3312 NextInChain = NewRed;
3313 } else if (isPartialReduction()) {
3314 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
3315 "Unexpected partial reduction kind");
3316 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ false);
3317 NewRed = State.Builder.CreateIntrinsic(
3318 PrevInChain->getType(),
3319 Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
3320 : Intrinsic::vector_partial_reduce_fadd,
3321 {PrevInChain, NewVecOp}, State.Builder.getFastMathFlags(),
3322 "partial.reduce");
// NOTE(review): same as above -- this store to the branch-local PrevInChain
// is not read again in the visible code.
3323 PrevInChain = NewRed;
3324 NextInChain = NewRed;
3325 } else {
3326 assert(isInLoop() &&
3327 "The reduction must either be ordered, partial or in-loop");
3328 Value *PrevInChain = State.get(getChainOp(), /*IsScalar*/ true);
3329 NewRed = createSimpleReduction(State.Builder, NewVecOp, Kind);
3331 NextInChain = createMinMaxOp(State.Builder, Kind, NewRed, PrevInChain);
3332 else
3333 NextInChain = State.Builder.CreateBinOp(
3335 PrevInChain, NewRed);
3336 }
3337 State.set(this, NextInChain, /*IsScalar*/ !isPartialReduction());
3338}
3339
// NOTE(review): listing lines 3340 (the signature), 3347 (presumably the
// definition of `Kind`), 3363 (condition that selects createMinMaxOp) and 3367
// (first CreateBinOp arguments) are missing from this extract.
//
// Generates IR for a reduction that takes an explicit vector length (EVL).
// The reduction is always masked: the recipe's condition operand is used when
// present, otherwise an all-true splat of width State.VF. The result is stored
// as a scalar.
3341
3342 auto &Builder = State.Builder;
3343 // Propagate the fast-math flags carried by the underlying instruction.
3344 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
3345 Builder.setFastMathFlags(getFastMathFlagsOrNone());
3346
3348 Value *Prev = State.get(getChainOp(), /*IsScalar*/ true);
3349 Value *VecOp = State.get(getVecOp());
3350 Value *EVL = State.get(getEVL(), VPLane(0));
3351
3352 Value *Mask;
3353 if (VPValue *CondOp = getCondOp())
3354 Mask = State.get(CondOp);
3355 else
3356 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
3357
3358 Value *NewRed;
// Ordered reductions fold the chain value in directly; the other form reduces
// the vector first and then combines the result with the chain value.
3359 if (isOrdered()) {
3360 NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
3361 } else {
3362 NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
3364 NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
3365 else
3366 NewRed = Builder.CreateBinOp(
3368 Prev);
3369 }
3370 State.set(this, NewRed, /*IsScalar*/ true);
3371}
3372
// NOTE(review): listing lines 3373 (start of the signature), 3379 (definition
// of `FMFs`), 3386 (definition of `Pred`), 3388 (initializer of `CondTy`),
// 3400 (start of the assert condition) and 3406-3407 (the min/max test and the
// definition of `Id`) are missing from this extract.
//
// Cost of a reduction recipe at vectorization factor VF:
//  * partial reductions: TTI partial-reduction cost, plus the cost of a
//    select when the reduction is conditional;
//  * min/max reductions: TTI min/max reduction cost;
//  * everything else:    TTI arithmetic reduction cost.
// Fast-math flags are only passed on for floating-point element types.
3374 VPCostContext &Ctx) const {
3375 RecurKind RdxKind = getRecurrenceKind();
3376 Type *ElementTy = this->getScalarType();
3377 auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
3378 unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
3380 std::optional<FastMathFlags> OptionalFMF =
3381 ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
3382
3383 if (isPartialReduction()) {
3384 InstructionCost CondCost = 0;
3385 if (isConditional()) {
3387 auto *CondTy =
3389 CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy,
3390 CondTy, Pred, Ctx.CostKind);
3391 }
3392 return CondCost + Ctx.TTI.getPartialReductionCost(
3393 Opcode, ElementTy, ElementTy, ElementTy, VF,
3394 TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind,
3395 OptionalFMF);
3396 }
3397
3398 // TODO: Support any-of reductions.
3399 assert(
3401 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
3402 "Any-of reduction not implemented in VPlan-based cost model currently.");
3403
3404 // Note that TTI should model the cost of moving result to the scalar register
3405 // and the BinOp cost in the getMinMaxReductionCost().
3408 return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
3409 }
3410
3411 // Note that TTI should model the cost of moving result to the scalar register
3412 // and the BinOp cost in the getArithmeticReductionCost().
3413 return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
3414 Ctx.CostKind);
3415}
3416
3417VPExpressionRecipe::VPExpressionRecipe(
3418 ExpressionTypes ExpressionType,
3419 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
3420 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {},
3421 cast<VPReductionRecipe>(ExpressionRecipes.back())
3422 ->getChainOp()
3423 ->getScalarType()),
3424 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
3425 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
3426 assert(
3427 none_of(ExpressionRecipes,
3428 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3429 "expression cannot contain recipes with side-effects");
3430
3431 // Maintain a copy of the expression recipes as a set of users.
3432 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
3433 for (auto *R : ExpressionRecipes)
3434 ExpressionRecipesAsSetOfUsers.insert(R);
3435
3436 // Recipes in the expression, except the last one, must only be used by
3437 // (other) recipes inside the expression. If there are other users, external
3438 // to the expression, use a clone of the recipe for external users.
3439 for (VPSingleDefRecipe *R : reverse(ExpressionRecipes)) {
3440 if (R != ExpressionRecipes.back() &&
3441 any_of(R->users(), [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
3442 return !ExpressionRecipesAsSetOfUsers.contains(U);
3443 })) {
3444 // There are users outside of the expression. Clone the recipe and use the
3445 // clone those external users.
3446 VPSingleDefRecipe *CopyForExtUsers = R->clone();
3447 R->replaceUsesWithIf(CopyForExtUsers, [&ExpressionRecipesAsSetOfUsers](
3448 VPUser &U, unsigned) {
3449 return !ExpressionRecipesAsSetOfUsers.contains(&U);
3450 });
3451 CopyForExtUsers->insertBefore(R);
3452 }
3453 if (R->getParent())
3454 R->removeFromParent();
3455 }
3456
3457 // Internalize all external operands to the expression recipes. To do so,
3458 // create new temporary VPValues for all operands defined by a recipe outside
3459 // the expression. The original operands are added as operands of the
3460 // VPExpressionRecipe itself.
3461 for (auto *R : ExpressionRecipes) {
3462 for (const auto &[Idx, Op] : enumerate(R->operands())) {
3463 auto *Def = Op->getDefiningRecipe();
3464 if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
3465 continue;
3466 addOperand(Op);
3467 LiveInPlaceholders.push_back(new VPSymbolicValue(Op->getScalarType()));
3468 }
3469 }
3470
3471 // Replace each external operand with the first one created for it in
3472 // LiveInPlaceholders.
3473 for (auto *R : ExpressionRecipes)
3474 for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders))
3475 R->replaceUsesOfWith(LiveIn, Tmp);
3476}
3477
// NOTE(review): the signature line (listing line 3478) is missing from this
// extract.
//
// Undoes the bundling done by the constructor: re-inserts the bundled recipes
// before this recipe, swaps every placeholder back to the operand it stands
// for, redirects all uses of this recipe to the last bundled recipe, and
// empties the bundle.
3479 for (auto *R : ExpressionRecipes)
3480 // Since the list could contain duplicates, make sure the recipe hasn't
3481 // already been inserted.
3482 if (!R->getParent())
3483 R->insertBefore(this);
3484
// Placeholder Idx is paired with operand Idx of this recipe.
3485 for (const auto &[Idx, Op] : enumerate(operands()))
3486 LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
3487
3488 replaceAllUsesWith(ExpressionRecipes.back());
3489 ExpressionRecipes.clear();
3490}
3491
// NOTE(review): listing lines 3492 (start of the signature), 3496 (initializer
// of `SrcVecTy`), 3512 (one getPartialReductionCost argument), 3523 (body of
// the final `else` in the extended-reduction case), and 3550 / 3552 (starts of
// two getPartialReductionCost arguments) are missing from this extract.
//
// Cost of the bundled expression, dispatched on ExpressionType. The base
// opcode comes from the recurrence kind of the last bundled recipe (a
// reduction); the "negated" expression kinds turn Add into Sub and FAdd into
// FSub before falling through to the matching non-negated case.
3493 VPCostContext &Ctx) const {
3494 Type *RedTy = this->getScalarType();
3495 auto *SrcVecTy =
3497 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3498 cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
3499 switch (ExpressionType) {
3500 case ExpressionTypes::NegatedExtendedReduction:
3501 assert((Opcode == Instruction::Add || Opcode == Instruction::FAdd) &&
3502 "Unexpected opcode");
3503 Opcode = Opcode == Instruction::Add ? Instruction::Sub : Instruction::FSub;
3504 [[fallthrough]];
3505 case ExpressionTypes::ExtendedReduction: {
3506 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3507 auto *ExtR = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3508
3509 if (RedR->isPartialReduction())
3510 return Ctx.TTI.getPartialReductionCost(
3511 Opcode, getOperand(0)->getScalarType(), nullptr, RedTy, VF,
3513 TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind,
3514 RedTy->isFloatingPointTy()
3515 ? std::optional{RedR->getFastMathFlagsOrNone()}
3516 : std::nullopt);
3517 else if (!RedTy->isFloatingPointTy())
3518 // TTI::getExtendedReductionCost only supports integer types.
3519 return Ctx.TTI.getExtendedReductionCost(
3520 Opcode, ExtR->getOpcode() == Instruction::ZExt, RedTy, SrcVecTy,
3521 std::nullopt, Ctx.CostKind);
3522 else
3524 }
3525 case ExpressionTypes::MulAccReduction:
3526 return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
3527 Ctx.CostKind);
3528
3529 case ExpressionTypes::ExtNegatedMulAccReduction:
3530 switch (Opcode) {
3531 case Instruction::Add:
3532 Opcode = Instruction::Sub;
3533 break;
3534 case Instruction::FAdd:
3535 Opcode = Instruction::FSub;
3536 break;
3537 default:
3538 llvm_unreachable("Unsupported opcode for ExtNegatedMulAccReduction");
3539 }
3540 [[fallthrough]];
3541 case ExpressionTypes::ExtMulAccReduction: {
3542 auto *RedR = cast<VPReductionRecipe>(ExpressionRecipes.back());
3543 if (RedR->isPartialReduction()) {
// In this layout entries 0 and 1 of the bundle are casts and entry 2 is cast
// to VPWidenRecipe (named Mul).
3544 auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3545 auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3546 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3547 return Ctx.TTI.getPartialReductionCost(
3548 Opcode, getOperand(0)->getScalarType(),
3549 getOperand(1)->getScalarType(), RedTy, VF,
3551 Ext0R->getOpcode()),
3553 Ext1R->getOpcode()),
3554 Mul->getOpcode(), Ctx.CostKind,
3555 RedTy->isFloatingPointTy()
3556 ? std::optional{RedR->getFastMathFlagsOrNone()}
3557 : std::nullopt);
3558 }
3559 assert(Opcode != Instruction::FSub && "Only integer types are supported");
3560 return Ctx.TTI.getMulAccReductionCost(
3561 cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
3562 Instruction::ZExt,
3563 Opcode, RedTy, SrcVecTy, Ctx.CostKind);
3564 }
3565 }
3566 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3567}
3568
// NOTE(review): the signature line (listing line 3569) is missing from this
// extract.
// Returns true when at least one bundled recipe may read from or write to
// memory.
3570 return any_of(ExpressionRecipes, [](VPSingleDefRecipe *R) {
3571 return R->mayReadFromMemory() || R->mayWriteToMemory();
3572 });
3573}
3574
// NOTE(review): the signature line (listing line 3575) is missing from this
// extract.
// Always returns false; in assert-enabled builds it additionally checks that
// no bundled recipe reports side effects.
3576 assert(
3577 none_of(ExpressionRecipes,
3578 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3579 "expression cannot contain recipes with side-effects");
3580 return false;
3581}
3582
// NOTE(review): the signature line (listing line 3583) is missing from this
// extract, so the name of this predicate is not visible here.
// Returns true when the last bundled recipe is a VPReductionRecipe that is not
// a partial reduction.
3584 auto *RR = dyn_cast<VPReductionRecipe>(ExpressionRecipes.back());
3585 return RR && !RR->isPartialReduction();
3586}
3587
3588#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3589
// NOTE(review): listing lines 3590 (start of the signature), 3593, 3604, 3609,
// 3619, 3627, 3633, 3637, 3643, 3652, 3662, 3670 and 3678 are missing from
// this extract; judging by their neighbours most of them print an operand, so
// the visible code shows only the fixed text of the output.
//
// Debug printer for the bundled expression. The output has the general shape
//   EXPRESSION <...> = <start> + [partial.]reduce.<opcode> (<body>)
// where <body> depends on ExpressionType and a trailing ", <...>" is added for
// conditional reductions.
3591 VPSlotTracker &SlotTracker) const {
3592 O << Indent << "EXPRESSION ";
3594 O << " = ";
3595 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3596 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
// The start value is the last operand, or the second to last one when the
// reduction is conditional.
3597 VPValue *RdxStart =
3598 getOperand(getNumOperands() - (Red->isConditional() ? 2 : 1));
3599
3600 switch (ExpressionType) {
3601 case ExpressionTypes::NegatedExtendedReduction:
3602 case ExpressionTypes::ExtendedReduction: {
3603 bool Negated = ExpressionType == ExpressionTypes::NegatedExtendedReduction;
3605 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3606 O << Instruction::getOpcodeName(Opcode) << " (";
3607 if (Negated)
3608 O << (Opcode == Instruction::Add ? "sub (0, " : "fneg(");
3610 if (Negated)
3611 O << ")";
3612 Red->printFlags(O);
3613
3614 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3615 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3616 << *Ext0->getScalarType();
3617 if (Red->isConditional()) {
3618 O << ", ";
3620 }
3621 O << ")";
3622 break;
3623 }
3624 case ExpressionTypes::ExtNegatedMulAccReduction: {
3625 RdxStart->printAsOperand(O, SlotTracker);
3626 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3628 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3629 << " (sub (0, mul";
3630 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3631 Mul->printFlags(O);
3632 O << "(";
3634 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3635 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3636 << *Ext0->getScalarType() << "), (";
3638 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3639 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3640 << *Ext1->getScalarType() << ")";
3641 if (Red->isConditional()) {
3642 O << ", ";
3644 }
3645 O << "))";
3646 break;
3647 }
3648 case ExpressionTypes::MulAccReduction:
3649 case ExpressionTypes::ExtMulAccReduction: {
3650 RdxStart->printAsOperand(O, SlotTracker);
3651 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3653 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3654 << " (";
3655 O << "mul";
3656 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
// The multiply is bundle entry 2 when the inputs are extended (entries 0 and 1
// are then the casts) and entry 0 otherwise.
3657 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3658 : ExpressionRecipes[0]);
3659 Mul->printFlags(O);
3660 if (IsExtended)
3661 O << "(";
3663 if (IsExtended) {
3664 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3665 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3666 << *Ext0->getScalarType() << "), (";
3667 } else {
3668 O << ", ";
3669 }
3671 if (IsExtended) {
3672 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3673 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3674 << *Ext1->getScalarType() << ")";
3675 }
3676 if (Red->isConditional()) {
3677 O << ", ";
3679 }
3680 O << ")";
3681 break;
3682 }
3683 }
3684}
3685
// NOTE(review): listing lines 3686 (start of the signature), 3692, 3694, 3698,
// 3700 and 3703 are missing from this extract, so only the fixed text of the
// output is visible.
//
// Debug printer for a reduction recipe. The line starts with "PARTIAL-REDUCE "
// for partial reductions and "REDUCE " otherwise, then follows the shape
//   <...> = <...> +<flags> reduce.<...> (<...>[, <...>])
// with the bracketed part only for conditional reductions.
3687 VPSlotTracker &SlotTracker) const {
3688 if (isPartialReduction())
3689 O << Indent << "PARTIAL-REDUCE ";
3690 else
3691 O << Indent << "REDUCE ";
3693 O << " = ";
3695 O << " +";
3696 printFlags(O);
3697 O << " reduce.";
3699 O << " (";
3701 if (isConditional()) {
3702 O << ", ";
3704 }
3705 O << ")";
3706}
3707
// NOTE(review): listing lines 3708 (start of the signature), 3711, 3713,
// 3717-3718, 3720, 3722 and 3725 are missing from this extract, so only the
// fixed text of the output is visible.
//
// Debug printer for the EVL flavour of the reduction recipe. The output has
// the shape
//   REDUCE <...> = <...> +<flags> vp.reduce.<...> (<...>, <...>[, <...>])
// with the bracketed part only for conditional reductions.
3709 VPSlotTracker &SlotTracker) const {
3710 O << Indent << "REDUCE ";
3712 O << " = ";
3714 O << " +";
3715 printFlags(O);
3716 O << " vp.reduce."
3719 << " (";
3721 O << ", ";
3723 if (isConditional()) {
3724 O << ", ";
3726 }
3727 O << ")";
3728}
3729
3730#endif
3731
// NOTE(review): the signature line (listing line 3732) is missing from this
// extract.
//
// Generates IR for a single-scalar replicate recipe by cloning the underlying
// instruction, re-applying this recipe's flags, metadata and (for compares)
// predicate, rewiring the operands to their scalar values in State, and
// inserting the clone at the builder's insertion point. Recipes that are not
// single-scalar are rejected by the assert.
3733 assert(IsSingleScalar &&
3734 "VPReplicateRecipes must be unrolled before ::execute");
3735 auto *Instr = getUnderlyingInstr();
3736 Instruction *Cloned = Instr->clone();
3737 Type *ResultTy = getScalarType();
// Void results are neither named nor retyped.
3738 if (!ResultTy->isVoidTy()) {
3739 Cloned->setName(Instr->getName() + ".cloned");
3740 // The operands of the replicate recipe may have been narrowed, resulting in
3741 // a narrower result type. Update the type of the cloned instruction to the
3742 // correct type.
3743 if (ResultTy != Cloned->getType())
3744 Cloned->mutateType(ResultTy);
3745 }
3746
3747 applyFlags(*Cloned);
3748 applyMetadata(*Cloned);
3749
3750 if (hasPredicate())
3751 cast<CmpInst>(Cloned)->setPredicate(getPredicate());
3752
3753 // Replace the operands of the cloned instructions with their scalar
3754 // equivalents in the new loop.
3755 for (const auto &[Idx, V] : enumerate(operands()))
3756 Cloned->setOperand(Idx, State.get(V, true));
3757
3758 // Place the cloned scalar in the new loop.
3759 State.Builder.Insert(Cloned);
3760
3761 State.set(this, Cloned, true);
3762
3763 // If we just cloned a new assumption, add it the assumption cache.
3764 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3765 State.AC->registerAssumption(II);
3766}
3767
3768/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3769/// which the legacy cost model computes a SCEV expression when computing the
3770/// address cost. Computing SCEVs for VPValues is incomplete and returns
3771/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3772/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
///
/// Three outcomes are possible: a SCEVCouldNotCompute expression is handed back
/// unchanged, an expression accepted by vputils::isAddressSCEVForCost is
/// returned as is, and any other expression yields nullptr.
///
/// NOTE(review): listing line 3774 (a parameter line, presumably the
/// declaration of `PSE`) is missing from this extract.
3773static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3775 const Loop *L) {
3776 const SCEV *Addr = vputils::getSCEVExprForVPValue(Ptr, PSE, L);
3777 if (isa<SCEVCouldNotCompute>(Addr))
3778 return Addr;
3779
3780 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), L) ? Addr : nullptr;
3781}
3782
// NOTE(review): listing lines 3783 (start of the signature), 3785 (definition
// of `UI`), 3791 and 3796 (bodies of the two scalable-VF checks), 3807
// (initializer of `CalledFn`), 3809 (definition of `ArgOps`), 3829 and 3959
// (starts of two return expressions), 3837 (initializer of `ScalarCost`), 3849
// (start of the skipCostComputation argument), 3877 (condition before the
// first `break` in the load/store case), 3884 (definition of `OpInfo`),
// 3918-3920 (definitions of `VIC` and `Cost`) are missing from this extract.
//
// Cost of a replicate recipe at vectorization factor VF, dispatched on the
// opcode of the underlying instruction. Paths that leave the switch through
// `break`, and opcodes without a case, fall back to Ctx.getLegacyCost(). Most
// cases multiply a per-instruction cost by VF.getFixedValue() unless the
// recipe is single-scalar.
3784 VPCostContext &Ctx) const {
3786 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3787 // transform, avoid computing their cost multiple times for now.
3788 Ctx.SkipCostComputation.insert(UI);
3789
3790 if (VF.isScalable() && !isSingleScalar())
3792
3793 switch (UI->getOpcode()) {
3794 case Instruction::Alloca:
3795 if (VF.isScalable())
3797 return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul,
3798 this->getScalarType(), Ctx.CostKind);
3799 case Instruction::GetElementPtr:
3800 // We mark this instruction as zero-cost because the cost of GEPs in
3801 // vectorized code depends on whether the corresponding memory instruction
3802 // is scalarized or not. Therefore, we handle GEPs with the memory
3803 // instruction cost.
3804 return 0;
3805 case Instruction::Call: {
3806 auto *CalledFn =
3808 Type *ResultTy = this->getScalarType();
3810 return computeCallCost(CalledFn, ResultTy, ArgOps, isSingleScalar(), VF,
3811 Ctx);
3812 }
3813 case Instruction::Add:
3814 case Instruction::Sub:
3815 case Instruction::FAdd:
3816 case Instruction::FSub:
3817 case Instruction::Mul:
3818 case Instruction::FMul:
3819 case Instruction::FDiv:
3820 case Instruction::FRem:
3821 case Instruction::Shl:
3822 case Instruction::LShr:
3823 case Instruction::AShr:
3824 case Instruction::And:
3825 case Instruction::Or:
3826 case Instruction::Xor:
3827 case Instruction::ICmp:
3828 case Instruction::FCmp:
3830 Ctx) *
3831 (isSingleScalar() ? 1 : VF.getFixedValue());
3832 case Instruction::SDiv:
3833 case Instruction::UDiv:
3834 case Instruction::SRem:
3835 case Instruction::URem: {
3836 InstructionCost ScalarCost =
3838 if (isSingleScalar())
3839 return ScalarCost;
3840
3841 // If any of the operands is from a different replicate region and has its
3842 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3843 // model to avoid cost mis-match.
3844 if (any_of(operands(), [&Ctx, VF](VPValue *Op) {
3845 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Op);
3846 if (!PredR)
3847 return false;
3848 return Ctx.skipCostComputation(
3850 PredR->getOperand(0)->getUnderlyingValue()),
3851 VF.isVector());
3852 }))
3853 break;
3854
3855 ScalarCost = ScalarCost * VF.getFixedValue() +
3856 Ctx.getScalarizationOverhead(this->getScalarType(),
3857 to_vector(operands()), VF);
3858 // If the recipe is not predicated (i.e. not in a replicate region), return
3859 // the scalar cost. Otherwise handle predicated cost.
3860 if (!getRegion()->isReplicator())
3861 return ScalarCost;
3862
3863 // Account for the phi nodes that we will create.
3864 ScalarCost += VF.getFixedValue() *
3865 Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
3866 // Scale the cost by the probability of executing the predicated blocks.
3867 // This assumes the predicated block for each vector lane is equally
3868 // likely.
3869 ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3870 return ScalarCost;
3871 }
3872 case Instruction::Load:
3873 case Instruction::Store: {
3874 bool IsLoad = UI->getOpcode() == Instruction::Load;
// The address is operand 0 of a load and operand 1 of a store.
3875 const VPValue *PtrOp = getOperand(!IsLoad);
3876 const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.PSE, Ctx.L);
3878 break;
3879
3880 Type *ValTy = (IsLoad ? this : getOperand(0))->getScalarType();
3881 Type *ScalarPtrTy = PtrOp->getScalarType();
3882 const Align Alignment = getLoadStoreAlignment(UI);
3883 unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
3885 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3886 bool UsedByLoadStoreAddress =
3887 !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(this);
3888 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3889 UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo,
3890 UsedByLoadStoreAddress ? UI : nullptr);
3891
3892 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
3893 InstructionCost ScalarCost =
3894 ScalarMemOpCost +
3895 Ctx.TTI.getAddressComputationCost(
3896 PtrTy, UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), PtrSCEV,
3897 Ctx.CostKind);
3898 if (isSingleScalar())
3899 return ScalarCost;
3900
3901 SmallVector<const VPValue *> OpsToScalarize;
3902 Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
3903 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3904 // don't assign scalarization overhead in general, if the target prefers
3905 // vectorized addressing or the loaded value is used as part of an address
3906 // of another load or store.
3907 if (!UsedByLoadStoreAddress) {
3908 bool EfficientVectorLoadStore =
3909 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3910 if (!(IsLoad && !PreferVectorizedAddressing) &&
3911 !(!IsLoad && EfficientVectorLoadStore))
3912 append_range(OpsToScalarize, operands());
3913
3914 if (!EfficientVectorLoadStore)
3915 ResultTy = this->getScalarType();
3916 }
3917
3921 (ScalarCost * VF.getFixedValue()) +
3922 Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
3923
// Inside a replicator region the cost is scaled by the block divisor, then a
// conditional branch and the extraction of every i1 mask lane are added.
3924 const VPRegionBlock *ParentRegion = getRegion();
3925 if (ParentRegion && ParentRegion->isReplicator()) {
3926 if (!PtrSCEV)
3927 break;
3928 Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
3929 Cost += Ctx.TTI.getCFInstrCost(Instruction::CondBr, Ctx.CostKind);
3930
3931 auto *VecI1Ty = VectorType::get(
3932 IntegerType::getInt1Ty(Ctx.L->getHeader()->getContext()), VF);
3933 Cost += Ctx.TTI.getScalarizationOverhead(
3934 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
3935 /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
3936
3937 if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
3938 // Artificially setting to a high enough value to practically disable
3939 // vectorization with such operations.
3940 return 3000000;
3941 }
3942 }
3943 return Cost;
3944 }
3945 case Instruction::SExt:
3946 case Instruction::ZExt:
3947 case Instruction::FPToUI:
3948 case Instruction::FPToSI:
3949 case Instruction::FPExt:
3950 case Instruction::PtrToInt:
3951 case Instruction::PtrToAddr:
3952 case Instruction::IntToPtr:
3953 case Instruction::SIToFP:
3954 case Instruction::UIToFP:
3955 case Instruction::Trunc:
3956 case Instruction::FPTrunc:
3957 case Instruction::Select:
3958 case Instruction::AddrSpaceCast: {
3960 Ctx) *
3961 (isSingleScalar() ? 1 : VF.getFixedValue());
3962 }
3963 case Instruction::ExtractValue:
3964 case Instruction::InsertValue:
3965 return Ctx.TTI.getInsertExtractValueCost(getOpcode(), Ctx.CostKind);
3966 }
3967
3968 return Ctx.getLegacyCost(UI, VF);
3969}
3970
// NOTE(review): listing lines 3971 (start of the signature), 3974 (start of
// the definition of `Tys`), 3980 (the value returned for non-intrinsics) and
// 3999 (the value returned for scalable VFs) are missing from this extract.
// The parameter list matches the computeCallCost(...) call made by the
// replicate cost code, so this is presumably that helper -- confirm upstream.
//
// Cost of replicating a call to CalledFn:
//  * 0 for intrinsics that VPCostContext::isFreeScalarIntrinsic accepts;
//  * for a single-scalar call, the cheaper of the call cost and the intrinsic
//    cost;
//  * otherwise VF copies of the scalar call plus the scalarization overhead
//    for the result and the arguments.
3972 Function *CalledFn, Type *ResultTy, ArrayRef<const VPValue *> ArgOps,
3973 bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx) {
3975 ArgOps, [&](const VPValue *Op) { return Op->getScalarType(); });
3976
3977 Intrinsic::ID IntrinID = CalledFn->getIntrinsicID();
3978 auto GetIntrinsicCost = [&] {
3979 if (!IntrinID)
3981 return Ctx.TTI.getIntrinsicInstrCost(
3982 IntrinsicCostAttributes(IntrinID, ResultTy, Tys), Ctx.CostKind);
3983 };
3984
3985 if (IntrinID && VPCostContext::isFreeScalarIntrinsic(IntrinID)) {
3986 assert(GetIntrinsicCost() == 0 && "scalarizing intrinsic should be free");
3987 return 0;
3988 }
3989
3990 InstructionCost ScalarCallCost =
3991 Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
3992 if (IsSingleScalar) {
3993 ScalarCallCost = std::min(ScalarCallCost, GetIntrinsicCost());
3994 return ScalarCallCost;
3995 }
3996
3997 // Scalarization overhead is undefined for scalable VFs.
3998 if (VF.isScalable())
4000
4001 return ScalarCallCost * VF.getFixedValue() +
4002 Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
4003}
4004
4005#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): listing lines 4006 (start of the signature), 4011, 4018, 4024
// and 4026 are missing from this extract, so parts of the printed text are
// not visible.
//
// Debug printer for a replicate recipe. The line starts with "CLONE " for
// single-scalar recipes and "REPLICATE " otherwise; calls are printed as
// "call<flags>@<callee>(...)". " (S->V)" is appended when a
// VPPredInstPHIRecipe user needs more than scalar values.
4007 VPSlotTracker &SlotTracker) const {
4008 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
4009
4010 if (!getScalarType()->isVoidTy()) {
4012 O << " = ";
4013 }
4014 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
4015 O << "call";
4016 printFlags(O);
4017 O << "@" << CB->getCalledFunction()->getName() << "(";
4019 O, [&O, &SlotTracker](VPValue *Op) {
4020 Op->printAsOperand(O, SlotTracker);
4021 });
4022 O << ")";
4023 } else {
4025 printFlags(O);
4027 }
4028
4029 // Find if the recipe is used by a widened recipe via an intervening
4030 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
4031 if (any_of(users(), [](const VPUser *U) {
4032 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
4033 return !vputils::onlyScalarValuesUsed(PredR);
4034 return false;
4035 }))
4036 O << " (S->V)";
4037}
4038#endif
4039
// NOTE(review): the signature line (listing line 4040) is missing from this
// extract, so the owning recipe is not visible here.
// Must never run: per the message, the recipe is expected to have been removed
// when its replicate region was dissolved.
4041 llvm_unreachable("recipe must be removed when dissolving replicate region");
4042}
4043
// NOTE(review): listing line 4044 (start of the signature) is missing from
// this extract, so the owning recipe is not visible here.
// Always reports a cost of 0.
4045 VPCostContext &Ctx) const {
4046 // The legacy cost model doesn't assign costs to branches for individual
4047 // replicate regions. Match the current behavior in the VPlan cost model for
4048 // now.
4049 return 0;
4050}
4051
// NOTE(review): the signature line (listing line 4052) is missing from this
// extract, so the owning recipe is not visible here.
// Must never run: per the message, the recipe is expected to have been removed
// when its replicate region was dissolved.
4053 llvm_unreachable("recipe must be removed when dissolving replicate region");
4054}
4055
4056#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): listing lines 4057 (start of the signature), 4060 and 4062 are
// missing from this extract, so only the fixed text of the output is visible.
// Debug printer: writes "PHI-PREDICATED-INSTRUCTION <...> = <...>".
4058 VPSlotTracker &SlotTracker) const {
4059 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
4061 O << " = ";
4063}
4064#endif
4065
4067const VPRecipeBase *VPWidenLoadRecipe::getAsRecipe() const { return this; }
4068
4071
4073const VPRecipeBase *VPWidenStoreRecipe::getAsRecipe() const { return this; }
4074
4077
// NOTE(review): listing lines 4078 (start of the signature), 4081 (definition
// of `IsLoad`), 4098-4099 (statements inside the IsReverseMask lambda), 4110
// (condition guarding the pointer-type widening), 4120 (start of the
// MemIntrinsicCostAttributes argument), 4125 (definition of `Cost`) and 4133
// (start of the getOperandInfo argument) are missing from this extract.
//
// Cost of a widened load or store at vectorization factor VF:
//  * non-consecutive accesses: address computation plus a gather/scatter
//    intrinsic (masked_* or vp_* depending on the concrete recipe class);
//  * consecutive masked accesses: masked_load / masked_store intrinsic cost;
//  * consecutive unmasked accesses: plain memory-op cost.
4079 VPCostContext &Ctx) const {
4080 const VPRecipeBase *R = getAsRecipe();
// Loads take the element type from the recipe's own result, stores from
// operand 1.
4082 Type *ScalarTy = IsLoad ? cast<VPSingleDefRecipe>(R)->getScalarType()
4083 : R->getOperand(1)->getScalarType();
4084 Type *Ty = toVectorTy(ScalarTy, VF);
4085 unsigned AS =
4086 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4087 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
4088
4089 if (!Consecutive) {
4090 // TODO: Using the original IR may not be accurate.
4091 // Currently, ARM will use the underlying IR to calculate gather/scatter
4092 // instruction cost.
4093 [[maybe_unused]] auto IsReverseMask = [this, R]() {
4094 VPValue *Mask = getMask();
4095 if (!Mask)
4096 return false;
4097
4100
4101 return match(Mask, m_Reverse(m_VPValue()));
4102 };
4103 assert(!IsReverseMask() &&
4104 "Inconsecutive memory access should not have reverse order");
4105 Type *PtrTy = getAddr()->getScalarType();
4106 const Value *Ptr = getAddr()->getUnderlyingValue();
4107
4108 // If the address value is uniform across all lanes, then the address can be
4109 // calculated with scalar type and broadcast.
4111 PtrTy = toVectorTy(PtrTy, VF);
4112
4113 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_gather
4114 : isa<VPWidenStoreRecipe>(R) ? Intrinsic::masked_scatter
4115 : isa<VPWidenLoadEVLRecipe>(R) ? Intrinsic::vp_gather
4116 : Intrinsic::vp_scatter;
4117 return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
4118 Ctx.CostKind) +
4119 Ctx.TTI.getMemIntrinsicInstrCost(
4121 &Ingredient),
4122 Ctx.CostKind);
4123 }
4124
4126 if (IsMasked) {
4127 unsigned IID = isa<VPWidenLoadRecipe>(R) ? Intrinsic::masked_load
4128 : Intrinsic::masked_store;
4129 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
4130 MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), Ctx.CostKind);
4131 } else {
4132 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
4134 : R->getOperand(1));
4135 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
4136 OpInfo, &Ingredient);
4137 }
4138 return Cost;
4139}
4140
// NOTE(review): the signature line (listing line 4141) and listing line 4163
// (a statement between the load creation and State.set) are missing from this
// extract.
//
// Generates IR for a widened load of State.VF elements:
//  * non-consecutive: masked gather ("wide.masked.gather");
//  * consecutive with a mask: masked load with a poison pass-through
//    ("wide.masked.load");
//  * consecutive without a mask: plain aligned load ("wide.load").
4142 Type *ScalarDataTy = getScalarType();
4143 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
4144 bool CreateGather = !isConsecutive();
4145
4146 auto &Builder = State.Builder;
4147 Value *Mask = nullptr;
4148 if (auto *VPMask = getMask())
4149 Mask = State.get(VPMask);
4150
// A gather uses the address as a vector; a consecutive access uses the scalar
// address.
4151 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
4152 Value *NewLI;
4153 if (CreateGather) {
4154 NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
4155 "wide.masked.gather");
4156 } else if (Mask) {
4157 NewLI =
4158 Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
4159 PoisonValue::get(DataTy), "wide.masked.load");
4160 } else {
4161 NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
4162 }
4164 State.set(this, NewLI);
4165}
4166
4167#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): listing lines 4168 (start of the signature), 4171 and 4173 are
// missing from this extract, so only the fixed text of the output is visible.
// Debug printer: writes "WIDEN <...> = load <...>".
4169 VPSlotTracker &SlotTracker) const {
4170 O << Indent << "WIDEN ";
4172 O << " = load ";
4174}
4175#endif
4176
// NOTE(review): the signature line (listing line 4177) and listing line 4201
// (the argument of addParamAttr) are missing from this extract.
//
// Generates IR for a widened load that takes an explicit vector length: a
// vp.gather for non-consecutive accesses ("wide.masked.gather"), a vp.load
// otherwise ("vp.op.load"). The intrinsic always receives a mask; an all-true
// splat of width State.VF is used when the recipe has none.
4178 Type *ScalarDataTy = getScalarType();
4179 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
4180 bool CreateGather = !isConsecutive();
4181
4182 auto &Builder = State.Builder;
4183 CallInst *NewLI;
4184 Value *EVL = State.get(getEVL(), VPLane(0));
4185 Value *Addr = State.get(getAddr(), !CreateGather);
4186 Value *Mask = nullptr;
4187 if (VPValue *VPMask = getMask())
4188 Mask = State.get(VPMask);
4189 else
4190 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4191
4192 if (CreateGather) {
4193 NewLI = Builder.CreateIntrinsicWithoutFolding(DataTy, Intrinsic::vp_gather,
4194 {Addr, Mask, EVL}, nullptr,
4195 "wide.masked.gather");
4196 } else {
4197 NewLI = Builder.CreateIntrinsicWithoutFolding(
4198 DataTy, Intrinsic::vp_load, {Addr, Mask, EVL}, nullptr, "vp.op.load");
4199 }
4200 NewLI->addParamAttr(
4202 applyMetadata(*NewLI);
4203 State.set(this, NewLI);
4204}
4205
// NOTE(review): listing line 4206 (start of the signature) is missing from
// this extract.
//
// Cost of the EVL load. Non-consecutive or masked accesses defer to
// VPWidenMemoryRecipe::computeCost; a consecutive unmasked access is costed
// as a vp_load memory intrinsic on the widened type.
4207 VPCostContext &Ctx) const {
4208 if (!Consecutive || IsMasked)
4209 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4210
4211 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4212 // here because the EVL recipes using EVL to replace the tail mask. But in the
4213 // legacy model, it will always calculate the cost of mask.
4214 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4215 // don't need to compare to the legacy cost model.
4216 Type *Ty = toVectorTy(getScalarType(), VF);
4217 unsigned AS =
4218 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4219 return Ctx.TTI.getMemIntrinsicInstrCost(
4220 MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
4221 Ctx.CostKind);
4222}
4223
4224#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// NOTE(review): listing lines 4225 (start of the signature), 4228 and 4230 are
// missing from this extract, so only the fixed text of the output is visible.
// Debug printer: writes "WIDEN <...> = vp.load <...>".
4226 VPSlotTracker &SlotTracker) const {
4227 O << Indent << "WIDEN ";
4229 O << " = vp.load ";
4231}
4232#endif
4233
// NOTE(review): the signature line (listing line 4234) is missing from this
// extract.
//
// Generates IR for a widened store:
//  * non-consecutive: masked scatter;
//  * consecutive with a mask: masked store;
//  * consecutive without a mask: plain aligned store.
// The recipe's metadata is applied to the created instruction.
4235 VPValue *StoredVPValue = getStoredValue();
4236 bool CreateScatter = !isConsecutive();
4237
4238 auto &Builder = State.Builder;
4239
4240 Value *Mask = nullptr;
4241 if (auto *VPMask = getMask())
4242 Mask = State.get(VPMask);
4243
4244 Value *StoredVal = State.get(StoredVPValue);
// A scatter uses the address as a vector; a consecutive access uses the scalar
// address.
4245 Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
4246 Instruction *NewSI = nullptr;
4247 if (CreateScatter)
4248 NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
4249 else if (Mask)
4250 NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
4251 else
4252 NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
4253 applyMetadata(*NewSI);
4254}
4255
4256#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4258 VPSlotTracker &SlotTracker) const {
4259 O << Indent << "WIDEN store ";
4261}
4262#endif
4263
4265 VPValue *StoredValue = getStoredValue();
4266 bool CreateScatter = !isConsecutive();
4267
4268 auto &Builder = State.Builder;
4269
4270 CallInst *NewSI = nullptr;
4271 Value *StoredVal = State.get(StoredValue);
4272 Value *EVL = State.get(getEVL(), VPLane(0));
4273 Value *Mask = nullptr;
4274 if (VPValue *VPMask = getMask())
4275 Mask = State.get(VPMask);
4276 else
4277 Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
4278
4279 Value *Addr = State.get(getAddr(), !CreateScatter);
4280 if (CreateScatter) {
4281 NewSI = Builder.CreateIntrinsicWithoutFolding(
4282 Type::getVoidTy(EVL->getContext()), Intrinsic::vp_scatter,
4283 {StoredVal, Addr, Mask, EVL});
4284 } else {
4285 NewSI = Builder.CreateIntrinsicWithoutFolding(
4286 Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store,
4287 {StoredVal, Addr, Mask, EVL});
4288 }
4289 NewSI->addParamAttr(
4291 applyMetadata(*NewSI);
4292}
4293
4295 VPCostContext &Ctx) const {
4296 if (!Consecutive || IsMasked)
4297 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4298
4299 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4300 // here because the EVL recipes using EVL to replace the tail mask. But in the
4301 // legacy model, it will always calculate the cost of mask.
4302 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4303 // don't need to compare to the legacy cost model.
4304 Type *Ty = toVectorTy(getStoredValue()->getScalarType(), VF);
4305 unsigned AS =
4306 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4307 return Ctx.TTI.getMemIntrinsicInstrCost(
4308 MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4309 Ctx.CostKind);
4310}
4311
4312#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4314 VPSlotTracker &SlotTracker) const {
4315 O << Indent << "WIDEN vp.store ";
4317}
4318#endif
4319
4321 VectorType *DstVTy, const DataLayout &DL) {
4322 // Verify that V is a vector type with same number of elements as DstVTy.
4323 auto VF = DstVTy->getElementCount();
4324 auto *SrcVecTy = cast<VectorType>(V->getType());
4325 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4326 Type *SrcElemTy = SrcVecTy->getElementType();
4327 Type *DstElemTy = DstVTy->getElementType();
4328 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4329 "Vector elements must have same size");
4330
4331 // Do a direct cast if element types are castable.
4332 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
4333 return Builder.CreateBitOrPointerCast(V, DstVTy);
4334 }
4335 // V cannot be directly casted to desired vector type.
4336 // May happen when V is a floating point vector but DstVTy is a vector of
4337 // pointers or vice-versa. Handle this using a two-step bitcast using an
4338 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4339 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4340 "Only one type should be a pointer type");
4341 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4342 "Only one type should be a floating point type");
4343 Type *IntTy =
4344 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
4345 auto *VecIntTy = VectorType::get(IntTy, VF);
4346 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
4347 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
4348}
4349
4350/// Return a vector containing interleaved elements from multiple
4351/// smaller input vectors.
4353 const Twine &Name) {
4354 unsigned Factor = Vals.size();
4355 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4356
4357 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
4358#ifndef NDEBUG
4359 for (Value *Val : Vals)
4360 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4361#endif
4362
4363 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4364 // must use intrinsics to interleave.
4365 if (VecTy->isScalableTy()) {
4366 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4367 return Builder.CreateVectorInterleave(Vals, Name);
4368 }
4369
4370 // Fixed length. Start by concatenating all vectors into a wide vector.
4371 Value *WideVec = concatenateVectors(Builder, Vals);
4372
4373 // Interleave the elements into the wide vector.
4374 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4375 return Builder.CreateShuffleVector(
4376 WideVec, createInterleaveMask(NumElts, Factor), Name);
4377}
4378
4379// Try to vectorize the interleave group that \p Instr belongs to.
4380//
4381// E.g. Translate following interleaved load group (factor = 3):
4382// for (i = 0; i < N; i+=3) {
4383// R = Pic[i]; // Member of index 0
4384// G = Pic[i+1]; // Member of index 1
4385// B = Pic[i+2]; // Member of index 2
4386// ... // do something to R, G, B
4387// }
4388// To:
4389// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4390// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4391// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4392// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4393//
4394// Or translate following interleaved store group (factor = 3):
4395// for (i = 0; i < N; i+=3) {
4396// ... do something to R, G, B
4397// Pic[i] = R; // Member of index 0
4398// Pic[i+1] = G; // Member of index 1
4399// Pic[i+2] = B; // Member of index 2
4400// }
4401// To:
4402// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4403// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4404// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4405// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4406// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4408 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4409 "Masking gaps for scalable vectors is not yet supported.");
4411 Instruction *Instr = Group->getInsertPos();
4412
4413 // Prepare for the vector type of the interleaved load/store.
4414 Type *ScalarTy = getLoadStoreType(Instr);
4415 unsigned InterleaveFactor = Group->getFactor();
4416 auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);
4417
4418 VPValue *BlockInMask = getMask();
4419 VPValue *Addr = getAddr();
4420 Value *ResAddr = State.get(Addr, VPLane(0));
4421
4422 auto CreateGroupMask = [&BlockInMask, &State,
4423 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4424 if (State.VF.isScalable()) {
4425 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4426 assert(InterleaveFactor <= 8 &&
4427 "Unsupported deinterleave factor for scalable vectors");
4428 auto *ResBlockInMask = State.get(BlockInMask);
4429 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4430 return interleaveVectors(State.Builder, Ops, "interleaved.mask");
4431 }
4432
4433 if (!BlockInMask)
4434 return MaskForGaps;
4435
4436 Value *ResBlockInMask = State.get(BlockInMask);
4437 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4438 ResBlockInMask,
4439 createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
4440 "interleaved.mask");
4441 return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
4442 ShuffledMask, MaskForGaps)
4443 : ShuffledMask;
4444 };
4445
4446 const DataLayout &DL = Instr->getDataLayout();
4447 // Vectorize the interleaved load group.
4448 if (isa<LoadInst>(Instr)) {
4449 Value *MaskForGaps = nullptr;
4450 if (needsMaskForGaps()) {
4451 MaskForGaps =
4452 createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
4453 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4454 }
4455
4456 Instruction *NewLoad;
4457 if (BlockInMask || MaskForGaps) {
4458 Value *GroupMask = CreateGroupMask(MaskForGaps);
4459 Value *PoisonVec = PoisonValue::get(VecTy);
4460 NewLoad = State.Builder.CreateMaskedLoad(VecTy, ResAddr,
4461 Group->getAlign(), GroupMask,
4462 PoisonVec, "wide.masked.vec");
4463 } else
4464 NewLoad = State.Builder.CreateAlignedLoad(VecTy, ResAddr,
4465 Group->getAlign(), "wide.vec");
4466 applyMetadata(*NewLoad);
4467 // TODO: Also manage existing metadata using VPIRMetadata.
4468 Group->addMetadata(NewLoad);
4469
4471 if (VecTy->isScalableTy()) {
4472 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4473 // so must use intrinsics to deinterleave.
4474 assert(InterleaveFactor <= 8 &&
4475 "Unsupported deinterleave factor for scalable vectors");
4476 NewLoad = State.Builder.CreateIntrinsicWithoutFolding(
4477 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4478 NewLoad->getType(), NewLoad,
4479 /*FMFSource=*/nullptr, "strided.vec");
4480 }
4481
4482 auto CreateStridedVector = [&InterleaveFactor, &State,
4483 &NewLoad](unsigned Index) -> Value * {
4484 assert(Index < InterleaveFactor && "Illegal group index");
4485 if (State.VF.isScalable())
4486 return State.Builder.CreateExtractValue(NewLoad, Index);
4487
4488 // For fixed length VF, use shuffle to extract the sub-vectors from the
4489 // wide load.
4490 auto StrideMask =
4491 createStrideMask(Index, InterleaveFactor, State.VF.getFixedValue());
4492 return State.Builder.CreateShuffleVector(NewLoad, StrideMask,
4493 "strided.vec");
4494 };
4495
4496 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4497 Instruction *Member = Group->getMember(I);
4498
4499 // Skip the gaps in the group.
4500 if (!Member)
4501 continue;
4502
4503 Value *StridedVec = CreateStridedVector(I);
4504
4505 // If this member has different type, cast the result type.
4506 if (Member->getType() != ScalarTy) {
4507 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4508 StridedVec =
4509 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4510 }
4511
4512 if (Group->isReverse())
4513 StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
4514
4515 State.set(VPDefs[J], StridedVec);
4516 ++J;
4517 }
4518 return;
4519 }
4520
4521 // The sub vector type for current instruction.
4522 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4523
4524 // Vectorize the interleaved store group.
4525 Value *MaskForGaps =
4526 createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
4527 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4528 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4529 ArrayRef<VPValue *> StoredValues = getStoredValues();
4530 // Collect the stored vector from each member.
4531 SmallVector<Value *, 4> StoredVecs;
4532 unsigned StoredIdx = 0;
4533 for (unsigned i = 0; i < InterleaveFactor; i++) {
4534 assert((Group->getMember(i) || MaskForGaps) &&
4535 "Fail to get a member from an interleaved store group");
4536 Instruction *Member = Group->getMember(i);
4537
4538 // Skip the gaps in the group.
4539 if (!Member) {
4540 Value *Undef = PoisonValue::get(SubVT);
4541 StoredVecs.push_back(Undef);
4542 continue;
4543 }
4544
4545 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4546 ++StoredIdx;
4547
4548 if (Group->isReverse())
4549 StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");
4550
4551 // If this member has different type, cast it to a unified type.
4552
4553 if (StoredVec->getType() != SubVT)
4554 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4555
4556 StoredVecs.push_back(StoredVec);
4557 }
4558
4559 // Interleave all the smaller vectors into one wider vector.
4560 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4561 Instruction *NewStoreInstr;
4562 if (BlockInMask || MaskForGaps) {
4563 Value *GroupMask = CreateGroupMask(MaskForGaps);
4564 NewStoreInstr = State.Builder.CreateMaskedStore(
4565 IVec, ResAddr, Group->getAlign(), GroupMask);
4566 } else
4567 NewStoreInstr =
4568 State.Builder.CreateAlignedStore(IVec, ResAddr, Group->getAlign());
4569
4570 applyMetadata(*NewStoreInstr);
4571 // TODO: Also manage existing metadata using VPIRMetadata.
4572 Group->addMetadata(NewStoreInstr);
4573}
4574
4575#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4577 VPSlotTracker &SlotTracker) const {
4579 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4581 VPValue *Mask = getMask();
4582 if (Mask) {
4583 O << ", ";
4584 Mask->printAsOperand(O, SlotTracker);
4585 }
4586
4587 unsigned OpIdx = 0;
4588 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4589 if (!IG->getMember(i))
4590 continue;
4591 if (getNumStoreOperands() > 0) {
4592 O << "\n" << Indent << " store ";
4594 O << " to index " << i;
4595 } else {
4596 O << "\n" << Indent << " ";
4598 O << " = load from index " << i;
4599 }
4600 ++OpIdx;
4601 }
4602}
4603#endif
4604
4606 assert(State.VF.isScalable() &&
4607 "Only support scalable VF for EVL tail-folding.");
4609 "Masking gaps for scalable vectors is not yet supported.");
4611 Instruction *Instr = Group->getInsertPos();
4612
4613 // Prepare for the vector type of the interleaved load/store.
4614 Type *ScalarTy = getLoadStoreType(Instr);
4615 unsigned InterleaveFactor = Group->getFactor();
4616 assert(InterleaveFactor <= 8 &&
4617 "Unsupported deinterleave/interleave factor for scalable vectors");
4618 ElementCount WideVF = State.VF * InterleaveFactor;
4619 auto *VecTy = VectorType::get(ScalarTy, WideVF);
4620
4621 VPValue *Addr = getAddr();
4622 Value *ResAddr = State.get(Addr, VPLane(0));
4623 Value *EVL = State.get(getEVL(), VPLane(0));
4624 Value *InterleaveEVL = State.Builder.CreateMul(
4625 EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
4626 /* NUW= */ true, /* NSW= */ true);
4627 LLVMContext &Ctx = State.Builder.getContext();
4628
4629 Value *GroupMask = nullptr;
4630 if (VPValue *BlockInMask = getMask()) {
4631 SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
4632 GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
4633 } else {
4634 GroupMask =
4635 State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
4636 }
4637
4638 // Vectorize the interleaved load group.
4639 if (isa<LoadInst>(Instr)) {
4640 CallInst *NewLoad = State.Builder.CreateIntrinsicWithoutFolding(
4641 VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
4642 "wide.vp.load");
4643 NewLoad->addParamAttr(0,
4644 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4645
4646 applyMetadata(*NewLoad);
4647 // TODO: Also manage existing metadata using VPIRMetadata.
4648 Group->addMetadata(NewLoad);
4649
4650 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4651 // so must use intrinsics to deinterleave.
4652 NewLoad = State.Builder.CreateIntrinsicWithoutFolding(
4653 Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
4654 NewLoad->getType(), NewLoad,
4655 /*FMFSource=*/nullptr, "strided.vec");
4656
4657 const DataLayout &DL = Instr->getDataLayout();
4658 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4659 Instruction *Member = Group->getMember(I);
4660 // Skip the gaps in the group.
4661 if (!Member)
4662 continue;
4663
4664 Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
4665 // If this member has different type, cast the result type.
4666 if (Member->getType() != ScalarTy) {
4667 VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
4668 StridedVec =
4669 createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
4670 }
4671
4672 State.set(getVPValue(J), StridedVec);
4673 ++J;
4674 }
4675 return;
4676 } // End for interleaved load.
4677
4678 // The sub vector type for current instruction.
4679 auto *SubVT = VectorType::get(ScalarTy, State.VF);
4680 // Vectorize the interleaved store group.
4681 ArrayRef<VPValue *> StoredValues = getStoredValues();
4682 // Collect the stored vector from each member.
4683 SmallVector<Value *, 4> StoredVecs;
4684 const DataLayout &DL = Instr->getDataLayout();
4685 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4686 Instruction *Member = Group->getMember(I);
4687 // Skip the gaps in the group.
4688 if (!Member) {
4689 StoredVecs.push_back(PoisonValue::get(SubVT));
4690 continue;
4691 }
4692
4693 Value *StoredVec = State.get(StoredValues[StoredIdx]);
4694 // If this member has different type, cast it to a unified type.
4695 if (StoredVec->getType() != SubVT)
4696 StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
4697
4698 StoredVecs.push_back(StoredVec);
4699 ++StoredIdx;
4700 }
4701
4702 // Interleave all the smaller vectors into one wider vector.
4703 Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
4704 CallInst *NewStore = State.Builder.CreateIntrinsicWithoutFolding(
4705 Type::getVoidTy(Ctx), Intrinsic::vp_store,
4706 {IVec, ResAddr, GroupMask, InterleaveEVL});
4707
4708 NewStore->addParamAttr(1,
4709 Attribute::getWithAlignment(Ctx, Group->getAlign()));
4710
4711 applyMetadata(*NewStore);
4712 // TODO: Also manage existing metadata using VPIRMetadata.
4713 Group->addMetadata(NewStore);
4714}
4715
4716#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4718 VPSlotTracker &SlotTracker) const {
4720 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4722 O << ", ";
4724 if (VPValue *Mask = getMask()) {
4725 O << ", ";
4726 Mask->printAsOperand(O, SlotTracker);
4727 }
4728
4729 unsigned OpIdx = 0;
4730 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4731 if (!IG->getMember(i))
4732 continue;
4733 if (getNumStoreOperands() > 0) {
4734 O << "\n" << Indent << " vp.store ";
4736 O << " to index " << i;
4737 } else {
4738 O << "\n" << Indent << " ";
4740 O << " = vp.load from index " << i;
4741 }
4742 ++OpIdx;
4743 }
4744}
4745#endif
4746
4748 VPCostContext &Ctx) const {
4749 Instruction *InsertPos = getInsertPos();
4750 // Find the VPValue index of the interleave group. We need to skip gaps.
4751 unsigned InsertPosIdx = 0;
4752 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4753 if (auto *Member = IG->getMember(Idx)) {
4754 if (Member == InsertPos)
4755 break;
4756 InsertPosIdx++;
4757 }
4758 const VPValue *ValV = getNumDefinedValues() > 0
4759 ? getVPValue(InsertPosIdx)
4760 : getStoredValues()[InsertPosIdx];
4761 Type *ValTy = ValV->getScalarType();
4762 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
4763 unsigned AS =
4764 cast<PointerType>(getAddr()->getScalarType())->getAddressSpace();
4765
4766 unsigned InterleaveFactor = IG->getFactor();
4767 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
4768
4769 // Holds the indices of existing members in the interleaved group.
4771 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4772 if (IG->getMember(IF))
4773 Indices.push_back(IF);
4774
4775 // Calculate the cost of the whole interleaved group.
4776 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4777 InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
4778 IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
4779
4780 if (!IG->isReverse())
4781 return Cost;
4782
4783 return Cost + IG->getNumMembers() *
4784 Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4785 VectorTy, VectorTy, {}, Ctx.CostKind,
4786 0);
4787}
4788
4790 return vputils::onlyScalarValuesUsed(this) &&
4791 (!IsScalable || vputils::onlyFirstLaneUsed(this));
4792}
4793
4794#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4797 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4798 "unexpected number of operands");
4799 O << Indent << "EMIT ";
4801 O << " = WIDEN-POINTER-INDUCTION ";
4803 O << ", ";
4805 O << ", ";
4807 if (getNumOperands() == 5) {
4808 O << ", ";
4810 O << ", ";
4812 }
4813}
4814
4816 VPSlotTracker &SlotTracker) const {
4817 O << Indent << "EMIT ";
4819 O << " = EXPAND SCEV " << *Expr;
4820}
4821#endif
4822
4823#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4825 VPSlotTracker &SlotTracker) const {
4826 O << Indent << "EMIT ";
4828 O << " = WIDEN-CANONICAL-INDUCTION";
4829 printFlags(O);
4831}
4832#endif
4833
4835 auto &Builder = State.Builder;
4836 // Create a vector from the initial value.
4837 auto *VectorInit = getStartValue()->getLiveInIRValue();
4838
4839 Type *VecTy = State.VF.isScalar()
4840 ? VectorInit->getType()
4841 : VectorType::get(VectorInit->getType(), State.VF);
4842
4843 BasicBlock *VectorPH =
4844 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4845 if (State.VF.isVector()) {
4846 auto *IdxTy = Builder.getInt32Ty();
4847 auto *One = ConstantInt::get(IdxTy, 1);
4848 IRBuilder<>::InsertPointGuard Guard(Builder);
4849 Builder.SetInsertPoint(VectorPH->getTerminator());
4850 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
4851 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4852 VectorInit = Builder.CreateInsertElement(
4853 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
4854 }
4855
4856 // Create a phi node for the new recurrence.
4857 PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
4858 Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
4859 Phi->addIncoming(VectorInit, VectorPH);
4860 State.set(this, Phi);
4861}
4862
4865 VPCostContext &Ctx) const {
4866 if (VF.isScalar())
4867 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4868
4869 return 0;
4870}
4871
4872#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4874 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4875 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4877 O << " = phi ";
4879}
4880#endif
4881
4883 // Reductions do not have to start at zero. They can start with
4884 // any loop invariant values.
4885 VPValue *StartVPV = getStartValue();
4886
4887 // In order to support recurrences we need to be able to vectorize Phi nodes.
4888 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4889 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4890 // this value when we vectorize all of the instructions that use the PHI.
4891 BasicBlock *VectorPH =
4892 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4893 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4894 Value *StartV = State.get(StartVPV, ScalarPHI);
4895 Type *VecTy = StartV->getType();
4896
4897 BasicBlock *HeaderBB = State.CFG.PrevBB;
4898 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4899 "recipe must be in the vector loop header");
4900 auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
4901 Phi->insertBefore(HeaderBB->getFirstInsertionPt());
4902 State.set(this, Phi, isInLoop());
4903
4904 Phi->addIncoming(StartV, VectorPH);
4905}
4906
4907#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4909 VPSlotTracker &SlotTracker) const {
4910 O << Indent << "WIDEN-REDUCTION-PHI ";
4911
4913 O << " = phi (";
4914 printRecurrenceKind(O, Kind);
4915 O << ")";
4916 printFlags(O);
4918 if (getVFScaleFactor() > 1)
4919 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4920}
4921#endif
4922
4924 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4925 return vputils::onlyFirstLaneUsed(this);
4926}
4927
4929 executePhiRecipe(this, *this, State, /*IsScalar=*/false, Name);
4930}
4931
4933 VPCostContext &Ctx) const {
4934 return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
4935}
4936
4937#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4939 VPSlotTracker &SlotTracker) const {
4940 O << Indent << "WIDEN-PHI ";
4941
4943 O << " = phi ";
4945}
4946#endif
4947
4949 BasicBlock *VectorPH =
4950 State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
4951 Value *StartMask = State.get(getOperand(0));
4952 PHINode *Phi =
4953 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
4954 Phi->addIncoming(StartMask, VectorPH);
4955 State.set(this, Phi);
4956}
4957
4958#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4960 VPSlotTracker &SlotTracker) const {
4961 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4962
4964 O << " = phi ";
4966}
4967#endif
4968
4969#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4971 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4972 O << Indent << "CURRENT-ITERATION-PHI ";
4973
4975 O << " = phi ";
4977}
4978#endif
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static void replaceAllUsesWith(Value *Old, Value *New, SmallPtrSet< BasicBlock *, 32 > &FreshBBs, bool IsHuge)
Replace all old uses with new ones, and push the updated BBs into FreshBBs.
Hexagon Common GEP
Value * getPointer(Value *Ptr)
iv users
Definition IVUsers.cpp:48
static constexpr Value * getValue(Ty &ValueOrUse)
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file provides a LoopVectorizationPlanner class.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
static bool isOrdered(const Instruction *I)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file contains the declarations of different VPlan-related auxiliary helpers.
static Value * interleaveVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vals, const Twine &Name)
Return a vector containing interleaved elements from multiple smaller input vectors.
static void executePhiRecipe(VPSingleDefRecipe *R, VPPhiAccessors &Phi, VPTransformState &State, bool IsScalar, const Twine &Name)
Shared execute logic for VPPhi and VPWidenPHIRecipe.
static Value * createBitOrPointerCast(IRBuilderBase &Builder, Value *V, VectorType *DstVTy, const DataLayout &DL)
static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind)
SmallVector< Value *, 2 > VectorParts
static cl::opt< bool > VPlanPrintMetadata("vplan-print-metadata", cl::init(true), cl::Hidden, cl::desc("Controls the printing of recipe metadata when debugging."))
static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind)
static unsigned getCalledFnOperandIndex(ArrayRef< VPValue * > Operands)
For call VPInstruction operands, return the operand index of the called function.
This file contains the declarations of the Vectorization Plan base classes:
void printAsOperand(OutputBuffer &OB, Prec P=Prec::Default, bool StrictlyWorse=false) const
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
static LLVM_ABI StringRef getPredicateName(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getUnknown()
Definition DebugLoc.h:151
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
LLVM_ABI void print(raw_ostream &O) const
Print fast-math flags to O.
Definition Operator.cpp:284
void setAllowContract(bool B=true)
Definition FMF.h:90
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
void setAllowReciprocal(bool B=true)
Definition FMF.h:87
bool allowReciprocal() const
Definition FMF.h:68
void setNoSignedZeros(bool B=true)
Definition FMF.h:84
bool allowReassoc() const
Flag queries.
Definition FMF.h:64
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
void setAllowReassoc(bool B=true)
Flag setters.
Definition FMF.h:75
bool noNaNs() const
Definition FMF.h:65
void setApproxFunc(bool B=true)
Definition FMF.h:93
void setNoInfs(bool B=true)
Definition FMF.h:81
bool allowContract() const
Definition FMF.h:69
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
bool willReturn() const
Determine if the function will return.
Definition Function.h:669
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition Function.h:246
bool doesNotThrow() const
Determine if the function cannot unwind.
Definition Function.h:602
bool doesNotAccessMemory() const
Determine if the function does not access memory.
Definition Function.cpp:863
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2669
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2723
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2657
LLVM_ABI Value * CreateVectorSpliceRight(Value *V1, Value *V2, Value *Offset, const Twine &Name="")
Create a vector.splice.right intrinsic call, or a shufflevector that produces the same result if the ...
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1268
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2716
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2735
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2133
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2318
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
LLVM_ABI Value * CreateVectorReverse(Value *V, const Twine &Name="")
Return a vector value that contains the vector V reversed.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2420
Value * CreateLogicalAnd(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1822
LLVM_ABI Value * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2550
Value * CreateNot(Value *V, const Twine &Name="")
Definition IRBuilder.h:1906
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2416
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1206
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1491
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2162
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1474
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition IRBuilder.h:514
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1783
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2428
Value * CreateLogicalOr(Value *Cond1, Value *Cond2, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition IRBuilder.h:1830
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2526
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1644
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1508
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
@ IK_IntInduction
Integer induction variable. Step = C.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isUnaryOp() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isReverse() const
InstTy * getInsertPos() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g. alias info) from the instructions in this group to NewInst.
Align getAlign() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
Root of the metadata hierarchy.
Definition Metadata.h:64
LLVM_ABI void print(raw_ostream &OS, const Module *M=nullptr, bool IsForDebug=false) const
Print.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static LLVM_ABI bool isSubRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is for a sub operation.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
This class represents the LLVM 'select' instruction.
This class provides computation of slot numbers for LLVM Assembly writing.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
@ TCC_Free
Expected to fold away in lowering.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isByteTy() const
True if this is an instance of ByteType.
Definition Type.h:242
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition Type.h:270
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
value_op_iterator value_op_end()
Definition User.h:288
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
value_op_iterator value_op_begin()
Definition User.h:285
void execute(VPTransformState &State) override
Generate the active lane mask phi of the vector loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4407
RecipeListTy & getRecipeList()
Returns a reference to the list of recipes.
Definition VPlan.h:4460
iterator end()
Definition VPlan.h:4444
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4473
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenMemoryRecipe.
VPValue * getIncomingValue(unsigned Idx) const
Return incoming value number Idx.
Definition VPlan.h:3002
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2997
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2993
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:94
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:222
VPlan * getPlan()
Definition VPlan.cpp:211
void printAsOperand(raw_ostream &OS, bool PrintType=false) const
Definition VPlan.h:364
static bool isHeader(const VPBlockBase *VPB, const VPDominatorTree &VPDT)
Returns true if VPB is a loop header, based on regions or VPDT in their absence.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPBranchOnMaskRecipe.
void execute(VPTransformState &State) override
Generate the extraction of the appropriate bit from the block mask and the conditional branch.
VPlan-based builder utility analogous to IRBuilder.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:561
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:534
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:546
ArrayRef< VPRecipeValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:556
InductionDescriptor::InductionKind getInductionKind() const
Definition VPlan.h:4231
VPValue * getIndex() const
Definition VPlan.h:4228
VPIRValue * getStartValue() const
Definition VPlan.h:4227
VPValue * getStepValue() const
Definition VPlan.h:4229
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPDerivedIVRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPExpandSCEVRecipe(const SCEV *Expr)
bool isVectorToScalar() const
Returns true if this VPExpressionRecipe produces a single scalar.
void decompose()
Insert the recipes of the expression back into the VPlan, directly before the current recipe.
bool mayHaveSideEffects() const
Returns true if this expression contains recipes that may have side effects.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
bool mayReadOrWriteMemory() const
Returns true if this expression contains recipes that may read from or write to memory.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this header phi recipe.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2472
void execute(VPTransformState &State) override
Produce a vectorized histogram operation.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPHistogramRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getMask() const
Return the mask operand if one was provided, or a null pointer if all lanes should be executed uncond...
Definition VPlan.h:2193
Class to record and manage LLVM IR flags.
Definition VPlan.h:695
FastMathFlagsTy FMFs
Definition VPlan.h:783
ReductionFlagsTy ReductionFlags
Definition VPlan.h:785
LLVM_ABI_FOR_TEST bool hasRequiredFlagsForOpcode(unsigned Opcode) const
Returns true if Opcode has its required flags set.
LLVM_ABI_FOR_TEST bool flagsValidForOpcode(unsigned Opcode) const
Returns true if the set flags are valid for Opcode.
static VPIRFlags getDefaultFlags(unsigned Opcode)
Returns default flags for Opcode for opcodes that support it, asserts otherwise.
WrapFlagsTy WrapFlags
Definition VPlan.h:777
void printFlags(raw_ostream &O) const
bool hasFastMathFlags() const
Returns true if the recipe has fast-math flags.
Definition VPlan.h:1000
bool isReductionOrdered() const
Definition VPlan.h:1064
TruncFlagsTy TruncFlags
Definition VPlan.h:778
CmpInst::Predicate getPredicate() const
Definition VPlan.h:972
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlagsOrNone() const
ExactFlagsTy ExactFlags
Definition VPlan.h:780
void intersectFlags(const VPIRFlags &Other)
Only keep flags also present in Other.
uint8_t GEPFlagsStorage
Definition VPlan.h:781
GEPNoWrapFlags getGEPNoWrapFlags() const
Definition VPlan.h:990
bool hasPredicate() const
Returns true if the recipe has a comparison predicate.
Definition VPlan.h:995
DisjointFlagsTy DisjointFlags
Definition VPlan.h:779
FCmpFlagsTy FCmpFlags
Definition VPlan.h:784
NonNegFlagsTy NonNegFlags
Definition VPlan.h:782
bool isReductionInLoop() const
Definition VPlan.h:1070
void applyFlags(Instruction &I) const
Apply the IR flags to I.
Definition VPlan.h:929
uint8_t CmpPredStorage
Definition VPlan.h:776
RecurKind getRecurKind() const
Definition VPlan.h:1058
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPIRInstruction.
VPIRInstruction(Instruction &I)
VPIRInstruction::create() should be used to create VPIRInstructions, as subclasses may need to be cre...
Definition VPlan.h:1727
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
VPIRMetadata()=default
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print metadata with node IDs.
void applyMetadata(Instruction &I) const
Add all metadata to I.
Type * getResultType() const
Definition VPlan.h:1588
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the instruction.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1226
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPInstruction.
VPInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", Type *ResultTy=nullptr)
bool doesGeneratePerAllLanes() const
Returns true if this VPInstruction generates scalar values for all lanes.
@ ExtractLastActive
Extracts the last active lane from a set of vectors.
Definition VPlan.h:1328
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1319
@ ExitingIVValue
Compute the exiting value of a wide induction after vectorization, that is the value of the last lane...
Definition VPlan.h:1332
@ WideIVStep
Scale the first operand (vector step) by the second operand (scalar-step).
Definition VPlan.h:1344
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1322
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1269
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1315
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1264
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1261
@ VScale
Returns the value for vscale.
Definition VPlan.h:1348
@ CanonicalIVIncrementForPart
Definition VPlan.h:1245
@ ComputeReductionResult
Reduce the operands to the final reduction result using the operation specified via the operation's V...
Definition VPlan.h:1272
bool hasResult() const
Definition VPlan.h:1438
bool opcodeMayReadOrWriteFromMemory() const
Returns true if the underlying opcode may read from or write to memory.
LLVM_DUMP_METHOD void dump() const
Print the VPInstruction to dbgs() (for debugging).
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the VPInstruction to O.
StringRef getName() const
Returns the symbolic name assigned to the VPInstruction.
Definition VPlan.h:1520
unsigned getOpcode() const
Definition VPlan.h:1417
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
void addOperand(VPValue *Op)
Add Op as operand of this VPInstruction.
bool isVectorToScalar() const
Returns true if this VPInstruction produces a scalar value from a vector, e.g. by performing a reduction or extracting a lane.
bool isSingleScalar() const
Returns true if this VPInstruction's operands are single scalars and the result is also a single scal...
unsigned getNumOperandsForOpcode() const
Return the number of operands determined by the opcode of the VPInstruction, excluding mask.
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1463
void execute(VPTransformState &State) override
Generate the instruction.
bool usesFirstPartOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first part of operand Op.
bool needsMaskForGaps() const
Return true if the access needs a mask because of the gaps.
Definition VPlan.h:3107
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this recipe.
Instruction * getInsertPos() const
Definition VPlan.h:3111
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:3109
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3101
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:3130
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3095
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3204
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3217
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
unsigned getNumStoreOperands() const override
Returns the number of stored operands of this interleave group.
Definition VPlan.h:3167
void execute(VPTransformState &State) override
Generate the wide load or store, and shuffles.
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset)
static VPLane getFirstLane()
Helper type to provide functions to access incoming values and blocks for phi-like recipes.
Definition VPlan.h:1607
virtual const VPRecipeBase * getAsRecipe() const =0
Return a VPRecipeBase* to the current object.
VPValue * getIncomingValueForBlock(const VPBasicBlock *VPBB) const
Returns the incoming value for VPBB. VPBB must be an incoming block.
void removeIncomingValueFor(VPBlockBase *IncomingBlock) const
Removes the incoming value for IncomingBlock, which must be a predecessor.
const VPBasicBlock * getIncomingBlock(unsigned Idx) const
Returns the incoming block with index Idx.
Definition VPlan.h:4551
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1656
VPValue * getIncomingValue(unsigned Idx) const
Returns the incoming VPValue with index Idx.
Definition VPlan.h:1616
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the recipe.
void setIncomingValueForBlock(const VPBasicBlock *VPBB, VPValue *V) const
Sets the incoming value for VPBB to V.
void execute(VPTransformState &State) override
Generates phi nodes for live-outs (from a replicate region) as needed to retain SSA form.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:402
bool mayReadFromMemory() const
Returns true if the recipe may read from memory.
bool mayHaveSideEffects() const
Returns true if the recipe may have side-effects.
virtual void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const =0
Each concrete VPRecipe prints itself, without printing common information, like debug info or metadat...
VPRegionBlock * getRegion()
Definition VPlan.h:4752
LLVM_ABI_FOR_TEST void dump() const
Dump the recipe to stderr (for debugging).
Definition VPlan.cpp:117
bool isPhi() const
Returns true for PHI-like recipes.
bool mayWriteToMemory() const
Returns true if the recipe may write to memory.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Compute the cost of this recipe either using a recipe's specialized implementation or using the legac...
VPBasicBlock * getParent()
Definition VPlan.h:477
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:555
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
bool isSafeToSpeculativelyExecute() const
Return true if we can safely execute this recipe unconditionally even if it is masked originally.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this recipe, taking into account if the cost computation should be skipped and the...
void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const
Print the recipe, delegating to printRecipe().
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
unsigned getVPRecipeID() const
Definition VPlan.h:523
void moveAfter(VPRecipeBase *MovePos)
Unlink this recipe from its current VPBasicBlock and insert it into the VPBasicBlock that MovePos liv...
VPRecipeBase(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:467
Type * getScalarType() const
Returns the scalar type of this VPRecipeValue.
Definition VPlanValue.h:337
friend class VPValue
Definition VPlanValue.h:316
void execute(VPTransformState &State) override
Generate the reduction in the loop.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPValue * getEVL() const
The VPValue of the explicit vector length.
Definition VPlan.h:3376
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2908
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2927
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool isConditional() const
Return true if the in-loop reduction is conditional.
Definition VPlan.h:3318
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of VPReductionRecipe.
VPValue * getVecOp() const
The VPValue of the vector value to be reduced.
Definition VPlan.h:3329
VPValue * getCondOp() const
The VPValue of the condition for the block.
Definition VPlan.h:3331
RecurKind getRecurrenceKind() const
Return the recurrence kind for the in-loop reduction.
Definition VPlan.h:3314
bool isPartialReduction() const
Returns true if the reduction outputs a vector with a scaled down VF.
Definition VPlan.h:3320
VPValue * getChainOp() const
The VPValue of the scalar Chain being accumulated.
Definition VPlan.h:3327
bool isInLoop() const
Returns true if the reduction is in-loop.
Definition VPlan.h:3322
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the reduction in the loop.
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4617
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4693
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isSingleScalar() const
Definition VPlan.h:3456
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPReplicateRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
static Type * computeScalarType(const Instruction *I, ArrayRef< VPValue * > Operands)
Compute the scalar result type for a VPReplicateRecipe wrapping I with Operands (excluding any predic...
static InstructionCost computeCallCost(Function *CalledFn, Type *ResultTy, ArrayRef< const VPValue * > ArgOps, bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx)
Return the cost of scalarizing a call to CalledFn with argument operands ArgOps for a given VF.
unsigned getOpcode() const
Definition VPlan.h:3491
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPScalarIVStepsRecipe.
VPValue * getStepValue() const
Definition VPlan.h:4296
VPValue * getStartIndex() const
Return the StartIndex, or null if known to be zero, valid only after unrolling.
Definition VPlan.h:4304
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the scalarized versions of the phi node as needed by their users.
VPSingleDefRecipe is a base class for recipes that model a sequence of one or more output IR that def...
Definition VPlan.h:609
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:680
LLVM_ABI_FOR_TEST LLVM_DUMP_METHOD void dump() const
Print this VPSingleDefRecipe to dbgs() (for debugging).
VPSingleDefRecipe(const unsigned char SC, ArrayRef< VPValue * > Operands, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:611
This class can be used to assign names to VPValues.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:384
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const
Print the operands to O.
Definition VPlan.cpp:1520
operand_range operands()
Definition VPlanValue.h:457
unsigned getNumOperands() const
Definition VPlanValue.h:424
operand_iterator op_end()
Definition VPlanValue.h:455
operand_iterator op_begin()
Definition VPlanValue.h:453
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:425
void addOperand(VPValue *Operand)
Definition VPlanValue.h:410
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:50
Type * getScalarType() const
Returns the scalar type of this VPValue, dispatching based on the concrete subclass.
Definition VPlan.cpp:149
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:143
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1471
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:130
void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const
Definition VPlan.cpp:1516
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:75
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:208
VPUser * getSingleUser()
Return the single user of this value, or nullptr if there is not exactly one user.
Definition VPlanValue.h:178
VPValue * getVFValue() const
Definition VPlan.h:2287
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
Type * getSourceElementType() const
Definition VPlan.h:2284
int64_t getStride() const
Definition VPlan.h:2285
void materializeOffset(unsigned Part=0)
Adds the offset operand to the recipe.
VPValue * getStride() const
Definition VPlan.h:2361
Type * getSourceElementType() const
Definition VPlan.h:2376
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
VPValue * getVFxPart() const
Definition VPlan.h:2363
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
operand_range args()
Definition VPlan.h:2144
Function * getCalledScalarFunction() const
Definition VPlan.h:2140
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCallRecipe.
void execute(VPTransformState &State) override
Produce a widened version of the call instruction.
static InstructionCost computeCallCost(Function *Variant, VPCostContext &Ctx)
Return the cost of widening a call using the vector function Variant.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce widened copies of the cast.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenCastRecipe.
void execute(VPTransformState &State) override
Generate the gep nodes.
Type * getSourceElementType() const
Definition VPlan.h:2241
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the recipe only uses the first lane of operand Op.
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2564
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2567
VPIRValue * getStartValue() const
Returns the start value of the induction.
Definition VPlan.h:2670
TruncInst * getTruncInst()
Returns the first defined value as TruncInst, if it is one or nullptr otherwise.
Definition VPlan.h:2685
bool isCanonical() const
Returns true if the induction is canonical, i.e.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
CallInst * createVectorCall(VPTransformState &State)
Helper function to produce the widened intrinsic call.
Intrinsic::ID getVectorIntrinsicID() const
Return the ID of the intrinsic.
Definition VPlan.h:2029
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
StringRef getIntrinsicName() const
Return the name of the intrinsic as a string.
static InstructionCost computeCallCost(Intrinsic::ID ID, ArrayRef< const VPValue * > Operands, const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx)
Compute the cost of a vector intrinsic with ID and Operands.
LLVM_ABI_FOR_TEST bool usesFirstLaneOnly(const VPValue *Op) const override
Returns true if the VPUser only uses the first lane of operand Op.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Produce a widened version of the vector intrinsic.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector intrinsic.
static InstructionCost computeMemIntrinsicCost(Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment, VPCostContext &Ctx)
Helper function for computing the cost of vector memory intrinsic.
void execute(VPTransformState &State) override
Produce a widened version of the vector memory intrinsic.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this vector memory intrinsic.
bool IsMasked
Whether the memory access is masked.
Definition VPlan.h:3760
bool isConsecutive() const
Return whether the loaded-from / stored-to addresses are consecutive.
Definition VPlan.h:3785
Instruction & Ingredient
Definition VPlan.h:3751
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const
Return the cost of this VPWidenMemoryRecipe.
bool Consecutive
Whether the accessed addresses are consecutive.
Definition VPlan.h:3757
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:3795
Align Alignment
Alignment information for this memory access.
Definition VPlan.h:3754
virtual VPRecipeBase * getAsRecipe()=0
Return a VPRecipeBase* to the current object.
VPValue * getAddr() const
Return the address accessed by this recipe.
Definition VPlan.h:3788
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenPHIRecipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate the phi/select nodes.
bool onlyScalarsGenerated(bool IsScalable)
Returns true if only scalar values will be generated.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenRecipe.
void execute(VPTransformState &State) override
Produce a widened instruction using the opcode and operands of the recipe, processing State....
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4765
const DataLayout & getDataLayout() const
Definition VPlan.h:4970
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4924
VPIRValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPIRValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:5072
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:394
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition Value.h:807
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
iterator erase(iterator where)
Definition ilist.h:204
pointer remove(iterator &IT)
Definition ilist.h:188
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor)
Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI StringRef getBaseName(ID id)
Return the LLVM name for an intrinsic, without encoded types for overloading, such as "llvm....
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
auto m_Cmp()
Matches any compare instruction and ignore it.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
LogicalOp_match< LHS, RHS, Instruction::And, true > m_c_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R with LHS and RHS in either order.
LogicalOp_match< LHS, RHS, Instruction::Or, true > m_c_LogicalOr(const LHS &L, const RHS &R)
Matches L || R with LHS and RHS in either order.
specific_intval< 1 > m_False()
specific_intval< 1 > m_True()
auto m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstPartUsed(const VPValue *Def)
Returns true if only the first part of Def is used.
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
bool isUsedByLoadStoreAddress(const VPValue *V)
Returns true if V is used as part of the address of another load or store.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
InstructionCost Cost
@ Undef
Value of the register doesn't matter.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition STLExtras.h:2312
auto cast_or_null(const Y &Val)
Definition Casting.h:714
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI Value * createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
ElementCount getVectorizedTypeVF(Type *Ty)
Returns the number of vector elements for a vectorized type.
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:377
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
cl::opt< unsigned > ForceTargetInstructionCost
LLVM_ABI Type * computeScalarTypeForInstruction(unsigned Opcode, ArrayRef< VPValue * > Operands)
Compute the scalar result type for an IR Opcode given Operands.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
@ Other
Any other memory.
Definition ModRef.h:68
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ FindIV
FindIV reduction with select(icmp(),x,y) where one of (x,y) is a loop induction variable (increasing ...
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ FMaxNum
FP max with llvm.maxnum semantics including NaNs.
@ Mul
Product of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ None
Not a recurrence.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ FMinNum
FP min with llvm.minnum semantics including NaNs.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI Value * createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind, Value *Src, Value *Start)
Create an ordered reduction intrinsic using the given recurrence kind RdxKind.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Struct to hold various analysis needed for cost computations.
static bool isFreeScalarIntrinsic(Intrinsic::ID ID)
Returns true if ID is a pseudo intrinsic that is dropped via scalarization rather than widened.
Definition VPlan.cpp:1939
TargetTransformInfo::TargetCostKind CostKind
void execute(VPTransformState &State) override
Generate the phi nodes.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this first-order recurrence phi recipe.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use cast/dyn_cast/isa and exec...
Definition VPlan.h:1785
PHINode & getIRPhi()
Definition VPlan.h:1798
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
The method which generates the output IR instructions that correspond to this VPRecipe,...
void execute(VPTransformState &State) override
Generate the instruction.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const
Compute the cost for this recipe for VF, using Opcode and Ctx.
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef< VPValue * > Operands, const VPIRFlags &Flags, DebugLoc DL=DebugLoc::getUnknown())
Definition VPlan.h:1118
A symbolic live-in VPValue, used for values like vector trip count, VF, and VFxUF.
Definition VPlanValue.h:286
SmallDenseMap< const VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
struct llvm::VPTransformState::CFGState CFG
Value * get(const VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition VPlan.cpp:313
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide load or gather.
LLVM_ABI_FOR_TEST VPRecipeBase * getAsRecipe() override
Return a VPRecipeBase* to the current object.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenLoadEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3880
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
void execute(VPTransformState &State) override
Generate a wide load or gather.
VPRecipeBase * getAsRecipe() override
Return a VPRecipeBase* to the current object.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3982
LLVM_ABI_FOR_TEST void execute(VPTransformState &State) override
Generate the wide store or scatter.
LLVM_ABI_FOR_TEST VPRecipeBase * getAsRecipe() override
Return a VPRecipeBase* to the current object.
LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
LLVM_ABI_FOR_TEST InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override
Return the cost of this VPWidenStoreEVLRecipe.
VPValue * getEVL() const
Return the EVL operand.
Definition VPlan.h:3985
void execute(VPTransformState &State) override
Generate a wide store or scatter.
void printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override
Print the recipe.
VPRecipeBase * getAsRecipe() override
Return a VPRecipeBase* to the current object.
VPValue * getStoredValue() const
Return the value stored by this recipe.
Definition VPlan.h:3930