LLVM 23.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
29 "riscv-v-register-bit-width-lmul",
31 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32 "by autovectorized code. Fractional LMULs are not supported."),
34
36 "riscv-v-slp-max-vf",
38 "Overrides result used for getMaximumVF query which is used "
39 "exclusively by SLP vectorizer."),
41
43 RVVMinTripCount("riscv-v-min-trip-count",
44 cl::desc("Set the lower bound of a trip count to decide on "
45 "vectorization while tail-folding."),
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(true), cl::Hidden);
50
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
54 // Check if the type is valid for all CostKind
55 if (!VT.isVector())
57 size_t NumInstr = OpCodes.size();
59 return NumInstr;
60 InstructionCost LMULCost = TLI->getLMULCost(VT);
62 return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(VL);
95 break;
96 }
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
129 const RISCVSubtarget *ST,
130 const APInt &Imm, Type *Ty,
132 bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
148 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
215 const APInt &Imm, Type *Ty,
217 Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of if it's the address or the
238 // value that is constant, except for if the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
248 ST->getPointerAddressSpace(), ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
327 const APInt &Imm, Type *Ty,
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
334 return ST->hasVInstructions();
335}
336
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348 if (Opcode == Instruction::FAdd)
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
356 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
358
359 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
361 // Note: Assuming all vdot4a* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(RISCV::VDOT4A_VV, LT.second, CostKind);
364}
365
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
392}
393
396 unsigned LMUL =
397 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
398 switch (K) {
400 return TypeSize::getFixed(ST->getXLen());
402 return TypeSize::getFixed(
403 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
406 (ST->hasVInstructions() &&
407 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
409 : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417 switch (CostKind) {
420 // Always 2 instructions
421 return 2;
422 case TTI::TCK_Latency:
424 // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
468 LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
474 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upper bound.
482 MVT LegalVT, VectorType *Tp,
483 ArrayRef<int> Mask,
485 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493 if (NumOfDests <= 1 ||
495 Tp->getElementType()->getPrimitiveSizeInBits() ||
496 LegalNumElts >= Tp->getElementCount().getFixedValue())
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Mask, NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
516 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
517 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
521 .second)
522 return;
523 Cost += TTI.getShuffleCost(
525 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
526 SingleOpTy, RegMask, CostKind, 0, nullptr);
527 },
528 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529 Cost += TTI.getShuffleCost(
531 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
532 SingleOpTy, RegMask, CostKind, 0, nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than 1 source registers are used for the
541/// destination register building, the cost for this destination register
542/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build mask and calculate the cost as a cost
544/// of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
550static InstructionCost
552 std::optional<unsigned> VLen, VectorType *Tp,
554 assert(LegalVT.isFixedLengthVector());
555 if (!VLen || Mask.empty())
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
565 if (NumOfDests <= 1 ||
567 Tp->getElementType()->getPrimitiveSizeInBits() ||
568 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
577 LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Mask, NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
592 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
593 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
601 SingleOpTy, RegMask, CostKind, 0, nullptr);
602 },
603 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
605 SingleOpTy, RegMask, CostKind, 0, nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: investigate, if it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
616}
617
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
621 // Avoid missing masks and length changing shuffles
622 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
627 // Avoid scalarization cases
628 if (!LT.second.isFixedLengthVector())
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633 if (LT.first != 1)
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(std::abs(SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
647
648 if (SrcInfo[1].second == 0)
649 std::swap(SrcInfo[0], SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670 VectorType *MaskTy =
672 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
678 VectorType *SrcTy, ArrayRef<int> Mask,
679 TTI::TargetCostKind CostKind, int Index,
681 const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
689
690 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
691 // For now, skip all fixed vector cost analysis when P extension is available
692 // to avoid crashes in getMinRVVVectorSizeInBits()
693 if (ST->hasStdExtP() && isa<FixedVectorType>(SrcTy))
694 return 1;
695
696 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
697
698 // First, handle cases where having a fixed length vector enables us to
699 // give a more accurate cost than falling back to generic scalable codegen.
700 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
701 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
702 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
704 *this, LT.second, ST->getRealVLen(),
705 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
706 if (VRegSplittingCost.isValid())
707 return VRegSplittingCost;
708 switch (Kind) {
709 default:
710 break;
712 if (Mask.size() >= 2) {
713 MVT EltTp = LT.second.getVectorElementType();
714 // If the size of the element is < ELEN then shuffles of interleaves and
715 // deinterleaves of 2 vectors can be lowered into the following
716 // sequences
717 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
718 // Example sequence:
719 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
720 // vwaddu.vv v10, v8, v9
721 // li a0, -1 (ignored)
722 // vwmaccu.vx v10, a0, v9
723 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
724 return 2 * LT.first * TLI->getLMULCost(LT.second);
725
726 if (Mask[0] == 0 || Mask[0] == 1) {
727 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
728 // Example sequence:
729 // vnsrl.wi v10, v8, 0
730 if (equal(DeinterleaveMask, Mask))
731 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
732 LT.second, CostKind);
733 }
734 }
735 int SubVectorSize;
736 if (LT.second.getScalarSizeInBits() != 1 &&
737 isRepeatedConcatMask(Mask, SubVectorSize)) {
739 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
740 // The cost of extraction from a subvector is 0 if the index is 0.
741 for (unsigned I = 0; I != NumSlides; ++I) {
742 unsigned InsertIndex = SubVectorSize * (1 << I);
743 FixedVectorType *SubTp =
744 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
745 FixedVectorType *DestTp =
747 std::pair<InstructionCost, MVT> DestLT =
749 // Add the cost of whole vector register move because the
750 // destination vector register group for vslideup cannot overlap the
751 // source.
752 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
753 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
754 CostKind, InsertIndex, SubTp);
755 }
756 return Cost;
757 }
758 }
759
760 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
761 SlideCost.isValid())
762 return SlideCost;
763
764 // vrgather + cost of generating the mask constant.
765 // We model this for an unknown mask with a single vrgather.
766 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
767 LT.second.getVectorNumElements() <= 256)) {
768 VectorType *IdxTy =
769 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
770 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
771 return IndexCost +
772 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
773 }
774 break;
775 }
778
779 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
780 SlideCost.isValid())
781 return SlideCost;
782
783 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
784 // register for the second vrgather. We model this for an unknown
785 // (shuffle) mask.
786 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
787 LT.second.getVectorNumElements() <= 256)) {
788 auto &C = SrcTy->getContext();
789 auto EC = SrcTy->getElementCount();
790 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
792 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
793 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
794 return 2 * IndexCost +
795 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
796 LT.second, CostKind) +
797 MaskCost;
798 }
799 break;
800 }
801 }
802
803 auto shouldSplit = [](TTI::ShuffleKind Kind) {
804 switch (Kind) {
805 default:
806 return false;
810 return true;
811 }
812 };
813
814 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
815 shouldSplit(Kind)) {
816 InstructionCost SplitCost =
817 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
818 if (SplitCost.isValid())
819 return SplitCost;
820 }
821 }
822
823 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
824 switch (Kind) {
825 default:
826 // Fallthrough to generic handling.
827 // TODO: Most of these cases will return getInvalid in generic code, and
828 // must be implemented here.
829 break;
831 // Extract at zero is always a subregister extract
832 if (Index == 0)
833 return TTI::TCC_Free;
834
835 // If we're extracting a subvector of at most m1 size at a sub-register
836 // boundary - which unfortunately we need exact vlen to identify - this is
837 // a subregister extract at worst and thus won't require a vslidedown.
838 // TODO: Extend for aligned m2, m4 subvector extracts
839 // TODO: Extend for misaligned (but contained) extracts
840 // TODO: Extend for scalable subvector types
841 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
842 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
843 if (std::optional<unsigned> VLen = ST->getRealVLen();
844 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
845 SubLT.second.getSizeInBits() <= *VLen)
846 return TTI::TCC_Free;
847 }
848
849 // Example sequence:
850 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
851 // vslidedown.vi v8, v9, 2
852 return LT.first *
853 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
855 // Example sequence:
856 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
857 // vslideup.vi v8, v9, 2
858 LT = getTypeLegalizationCost(DstTy);
859 return LT.first *
860 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
861 case TTI::SK_Select: {
862 // Example sequence:
863 // li a0, 90
864 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
865 // vmv.s.x v0, a0
866 // vmerge.vvm v8, v9, v8, v0
867 // We use 2 for the cost of the mask materialization as this is the true
868 // cost for small masks and most shuffles are small. At worst, this cost
869 // should be a very small constant for the constant pool load. As such,
870 // we may bias towards large selects slightly more than truly warranted.
871 return LT.first *
872 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
873 LT.second, CostKind));
874 }
875 case TTI::SK_Broadcast: {
876 // Check for broadcast loads, which are synthesized by optimized zero-stride
877 // loads (this is checked in RISCVTTIImpl::isLegalBroadcastLoad).
878 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
879 if (IsLoad && LT.second.isVector() &&
880 isLegalBroadcastLoad(SrcTy->getElementType(),
881 LT.second.getVectorElementCount()))
882 return 0;
883
884 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
885 Instruction::InsertElement);
886 if (LT.second.getScalarSizeInBits() == 1) {
887 if (HasScalar) {
888 // Example sequence:
889 // andi a0, a0, 1
890 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
891 // vmv.v.x v8, a0
892 // vmsne.vi v0, v8, 0
893 return LT.first *
894 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
895 LT.second, CostKind));
896 }
897 // Example sequence:
898 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
899 // vmv.v.i v8, 0
900 // vmerge.vim v8, v8, 1, v0
901 // vmv.x.s a0, v8
902 // andi a0, a0, 1
903 // vmv.v.x v8, a0
904 // vmsne.vi v0, v8, 0
905
906 return LT.first *
907 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
908 RISCV::VMV_X_S, RISCV::VMV_V_X,
909 RISCV::VMSNE_VI},
910 LT.second, CostKind));
911 }
912
913 if (HasScalar) {
914 // Example sequence:
915 // vmv.v.x v8, a0
916 return LT.first *
917 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
918 }
919
920 // Example sequence:
921 // vrgather.vi v9, v8, 0
922 return LT.first *
923 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
924 }
925 case TTI::SK_Splice: {
926 // vslidedown+vslideup.
927 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
928 // of similar code, but I think we expand through memory.
929 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
930 if (Index >= 0 && Index < 32)
931 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
932 else if (Index < 0 && Index > -32)
933 Opcodes[1] = RISCV::VSLIDEUP_VI;
934 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
935 }
936 case TTI::SK_Reverse: {
937
938 if (!LT.second.isVector())
940
941 // TODO: Cases to improve here:
942 // * Illegal vector types
943 // * i64 on RV32
944 if (SrcTy->getElementType()->isIntegerTy(1)) {
945 VectorType *WideTy =
946 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
947 cast<VectorType>(SrcTy)->getElementCount());
948 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
950 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
951 nullptr) +
952 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
954 }
955
956 MVT ContainerVT = LT.second;
957 if (LT.second.isFixedLengthVector())
958 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
959 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
960 if (ContainerVT.bitsLE(M1VT)) {
961 // Example sequence:
962 // csrr a0, vlenb
963 // srli a0, a0, 3
964 // addi a0, a0, -1
965 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
966 // vid.v v9
967 // vrsub.vx v10, v9, a0
968 // vrgather.vv v9, v8, v10
969 InstructionCost LenCost = 3;
970 if (LT.second.isFixedLengthVector())
971 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
972 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
973 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
974 if (LT.second.isFixedLengthVector() &&
975 isInt<5>(LT.second.getVectorNumElements() - 1))
976 Opcodes[1] = RISCV::VRSUB_VI;
977 InstructionCost GatherCost =
978 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
979 return LT.first * (LenCost + GatherCost);
980 }
981
982 // At high LMUL, we split into a series of M1 reverses (see
983 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
984 // the resulting gap at the bottom (for fixed vectors only). The important
985 // bit is that the cost scales linearly, not quadratically with LMUL.
986 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
987 InstructionCost FixedCost =
988 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
989 unsigned Ratio =
991 InstructionCost GatherCost =
992 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
993 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
994 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
995 return FixedCost + LT.first * (GatherCost + SlideCost);
996 }
997 }
998 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
999 SubTp);
1000}
1001
// Predicate over the LMUL of VT; the visible part of the comparison tests for
// LMUL_F8.
// NOTE(review): this listing skips original lines 1003 and 1005-1007, so the
// definition of LMUL and the remaining operands of the comparison are not
// visible here. Restore them from the upstream file before building.
// NOTE(review): the return type is `unsigned` although the visible expression
// is a boolean comparison; presumably `bool` was intended -- confirm.
1002static unsigned isM1OrSmaller(MVT VT) {
1004 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
1008}
1009
// Cost of inserting and/or extracting the demanded elements of vector type Ty.
// NOTE(review): the first signature line (return type and function name,
// original line 1010) is missing from this listing; the recursive call below
// suggests the name is getScalarizationOverhead -- confirm. Original lines
// 1014-1015, 1028 and 1039 are missing as well, so the declaration of `Cost`
// and the tail of the truncate-cost call are not visible.
1011 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
1012 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
1013 TTI::VectorInstrContext VIC) const {
1016
1017 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1018 // For now, skip all fixed vector cost analysis when P extension is available
1019 // to avoid crashes in getMinRVVVectorSizeInBits()
1020 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
1021 return 1; // Treat as single instruction cost for now
1022 }
1023
1024 // A build_vector (which is m1 sized or smaller) can be done in no
1025 // worse than one vslide1down.vx per element in the type. We could
1026 // in theory do an explode_vector in the inverse manner, but our
1027 // lowering today does not have a first class node for this pattern.
     // NOTE(review): the statement that starts here (original line 1028) is
     // cut off; only its argument list survives on the next line. It
     // presumably initializes `Cost` -- confirm against upstream.
1029 Ty, DemandedElts, Insert, Extract, CostKind);
1030 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
     // The cheaper estimate below only applies to pure inserts whose type
     // legalizes to a valid vector type.
1031 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1032 if (Ty->getScalarSizeInBits() == 1) {
     // i1 vectors are costed as the same query on the i8-widened type plus a
     // truncate back to Ty.
1033 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
1034 // Note: Implicit scalar anyextend is assumed to be free since the i1
1035 // must be stored in a GPR.
1036 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
1037 CostKind) +
1038 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
1040 }
1041
1042 assert(LT.second.isFixedLengthVector());
1043 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
1044 if (isM1OrSmaller(ContainerVT)) {
     // One VSLIDE1DOWN_VX per element of Ty.
1045 InstructionCost BV =
1046 cast<FixedVectorType>(Ty)->getNumElements() *
1047 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
     // Keep whichever of the two estimates is smaller.
1048 if (BV < Cost)
1049 Cost = BV;
1050 }
1051 }
1052 return Cost;
1053}
1054
// Dispatches on the memory intrinsic ID carried by MICA to the matching cost
// helper.
// NOTE(review): the signature (original lines 1055-1057) is missing from this
// listing, so the function name and the declaration of CostKind are not
// visible. Original lines 1064, 1075 and 1087 are missing too: the statement
// guarded by the isLegalFirstFaultLoad check, the body of the
// compressstore/expandload case, and whatever follows the switch.
1058 Type *DataTy = MICA.getDataType();
1059 Align Alignment = MICA.getAlignment();
1060 switch (MICA.getID()) {
1061 case Intrinsic::vp_load_ff: {
1062 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
1063 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
1065
     // A legal fault-only-first load is costed like an ordinary load of DataTy.
1066 unsigned AS = MICA.getAddressSpace();
1067 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
1068 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
1069 }
1070 case Intrinsic::experimental_vp_strided_load:
1071 case Intrinsic::experimental_vp_strided_store:
1072 return getStridedMemoryOpCost(MICA, CostKind);
1073 case Intrinsic::masked_compressstore:
1074 case Intrinsic::masked_expandload:
     // NOTE(review): as listed, these two cases appear to fall through into
     // the gather/scatter cases. That is presumably an artifact of the missing
     // line 1075 -- confirm against upstream.
1076 case Intrinsic::vp_scatter:
1077 case Intrinsic::vp_gather:
1078 case Intrinsic::masked_scatter:
1079 case Intrinsic::masked_gather:
1080 return getGatherScatterOpCost(MICA, CostKind);
1081 case Intrinsic::vp_load:
1082 case Intrinsic::vp_store:
1083 case Intrinsic::masked_load:
1084 case Intrinsic::masked_store:
1085 return getMaskedMemoryOpCost(MICA, CostKind);
1086 }
1088}
1089
// Cost of a masked load/store described by MICA: when the visible legality
// check passes, the operation is costed as a plain getMemoryOpCost of the data
// type.
// NOTE(review): the signature (original lines 1090-1092) and lines 1100-1101
// (the rest of the legality condition and the statement it guards) are missing
// from this listing.
// NOTE(review): only Intrinsic::masked_load selects Instruction::Load. Every
// other ID, including Intrinsic::vp_load if it reaches this function, is
// costed as a store -- confirm that this is intended.
1093 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1094 : Instruction::Store;
1095 Type *Src = MICA.getDataType();
1096 Align Alignment = MICA.getAlignment();
1097 unsigned AddressSpace = MICA.getAddressSpace();
1098
1099 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1102
1103 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1104}
1105
// Cost of an interleaved load/store group of type VecTy with interleave factor
// Factor. The visible logic tries, in order:
//  1. Without a gap mask and with Factor within the supported maximum: if the
//     per-field sub-vector is a legal interleaved access type, cost the group
//     as a segment access.
//  2. Scalable vectors that did not take path 1 (the returned value is on a
//     line missing from this listing).
//  3. Fixed vectors with a gap mask whose gaps are only at the tail.
//  4. Otherwise a wide memory op plus shuffles: one shuffle per index for
//     loads; for stores only Factor == 2 is modeled here and every other
//     factor defers to BaseT.
// NOTE(review): the first signature line (original line 1106) and original
// lines 1130, 1151, 1199 and 1226 are missing from this listing; the affected
// statements are marked below.
1107 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1108 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1109 bool UseMaskForCond, bool UseMaskForGaps) const {
1110
1111 // The interleaved memory access pass will lower (de)interleave ops combined
1112 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1113 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1114 // gap).
1115 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1116 auto *VTy = cast<VectorType>(VecTy);
1117 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
1118 // Need to make sure type hasn't been scalarized
1119 if (LT.second.isVector()) {
1120 auto *SubVecTy =
1121 VectorType::get(VTy->getElementType(),
1122 VTy->getElementCount().divideCoefficientBy(Factor));
1123 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1124 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1125 AddressSpace, DL)) {
1126
1127 // Some processors optimize segment loads/stores as one wide memory op +
1128 // Factor * LMUL shuffle ops.
1129 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
     // NOTE(review): the declaration of `Cost` (original line 1130) is
     // missing; only its initializer survives on the next line.
1131 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1132 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1133 Cost += Factor * TLI->getLMULCost(SubVecVT);
1134 return LT.first * Cost;
1135 }
1136
1137 // Otherwise, the cost is proportional to the number of elements (VL *
1138 // Factor ops).
     // NOTE(review): the address space is passed as a literal 0 here rather
     // than AddressSpace -- confirm that this is intended.
1139 InstructionCost MemOpCost =
1140 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1141 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1142 unsigned NumLoads = getEstimatedVLFor(VTy);
1143 return NumLoads * MemOpCost;
1144 }
1145 }
1146 }
1147
1148 // TODO: Return the cost of interleaved accesses for scalable vector when
1149 // unable to convert to segment accesses instructions.
     // NOTE(review): the statement guarded by this check (original line 1151)
     // is missing from the listing.
1150 if (isa<ScalableVectorType>(VecTy))
1152
1153 auto *FVTy = cast<FixedVectorType>(VecTy);
1154 // When gaps are only at the tail, for interleaved load, we can emit a wide
1155 // masked load and shufflevectors. For interleaved store, we can emit
1156 // shufflevectors and a wide masked store. The interleaved memory access pass
1157 // will lower them into vlsseg/vssseg intrinsics.
1158 if (UseMaskForGaps) {
1159 assert(llvm::is_sorted(Indices) && "Indices must be sorted");
1160 assert(llvm::adjacent_find(Indices) == Indices.end() &&
1161 "Indices should not contain duplicate elements");
1162 unsigned NumOfFields = Indices.size();
     // With sorted, duplicate-free indices, the last index being
     // NumOfFields - 1 means the used fields are exactly 0..NumOfFields-1.
1163 bool IsTailGapOnly = NumOfFields > 1 && (NumOfFields == Indices.back() + 1);
1164 if (IsTailGapOnly &&
1165 NumOfFields <= TLI->getMaxSupportedInterleaveFactor()) {
1166 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
1167 if (LT.second.isVector() &&
1168 FVTy->getElementCount().isKnownMultipleOf(Factor)) {
1169 auto *SubVecTy = VectorType::get(
1170 FVTy->getElementType(),
1171 FVTy->getElementCount().divideCoefficientBy(Factor));
1172 if (TLI->isLegalInterleavedAccessType(SubVecTy, NumOfFields, Alignment,
1173 AddressSpace, DL)) {
1174 // The cost is proportional to the total number of element accesses.
1175 unsigned NumAccesses = getEstimatedVLFor(FVTy);
1176 return NumAccesses * TTI::TCC_Basic;
1177 }
1178 }
1179 }
1180 }
1181
     // Fallback: one wide memory op on the whole vector plus explicit shuffles.
1182 InstructionCost MemCost =
1183 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1184 unsigned VF = FVTy->getNumElements() / Factor;
1185
1186 // An interleaved load will look like this for Factor=3:
1187 // %wide.vec = load <12 x i32>, ptr %3, align 4
1188 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1189 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1190 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1191 if (Opcode == Instruction::Load) {
1192 InstructionCost Cost = MemCost;
1193 for (unsigned Index : Indices) {
     // NOTE(review): this local VecTy shadows the VecTy parameter.
1194 FixedVectorType *VecTy =
1195 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1196 auto Mask = createStrideMask(Index, Factor, VF);
     // Pad the stride mask with -1 entries up to the wide vector length.
1197 Mask.resize(VF * Factor, -1);
     // NOTE(review): the start of the call that computes ShuffleCost
     // (original line 1199) is missing; only its trailing arguments survive.
1198 InstructionCost ShuffleCost =
1200 Mask, CostKind, 0, nullptr, {});
1201 Cost += ShuffleCost;
1202 }
1203 return Cost;
1204 }
1205
1206 // TODO: Model for NF > 2
1207 // We'll need to enhance getShuffleCost to model shuffles that are just
1208 // inserts and extracts into subvectors, since they won't have the full cost
1209 // of a vrgather.
1210 // An interleaved store for 3 vectors of 4 lanes will look like
1211 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1212 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1213 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1214 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1215 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1216 if (Factor != 2)
1217 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1218 Alignment, AddressSpace, CostKind,
1219 UseMaskForCond, UseMaskForGaps);
1220
1221 assert(Opcode == Instruction::Store && "Opcode must be a store");
1222 // For an interleaving store of 2 vectors, we perform one large interleaving
1223 // shuffle that goes into the wide store
1224 auto Mask = createInterleaveMask(VF, Factor);
     // NOTE(review): the start of the call that computes ShuffleCost
     // (original line 1226) is missing; only its trailing arguments survive.
1225 InstructionCost ShuffleCost =
1227 CostKind, 0, nullptr, {});
1228 return MemCost + ShuffleCost;
1229}
1230
// Cost of a gather/scatter described by MICA. When the legality check passes,
// the cost is the estimated number of element accesses times TCC_Basic.
// NOTE(review): the signature (original lines 1231-1233) and original lines
// 1240-1241 and 1247 are missing from this listing, so the function name and
// the statement guarded by the legality check are not visible.
1234
     // masked_gather and vp_gather are loads; every other ID reaching this
     // function is treated as a store.
1235 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1236 MICA.getID() == Intrinsic::vp_gather;
1237 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1238 Type *DataTy = MICA.getDataType();
1239 Align Alignment = MICA.getAlignment();
1242
1243 if ((Opcode == Instruction::Load &&
1244 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1245 (Opcode == Instruction::Store &&
1246 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1248
1249 // Cost is proportional to the number of memory operations implied. For
1250 // scalable vectors, we use an estimate on that number since we don't
1251 // know exactly what VL will be.
1252 auto &VTy = *cast<VectorType>(DataTy);
1253 unsigned NumLoads = getEstimatedVLFor(&VTy);
1254 return NumLoads * TTI::TCC_Basic;
1255}
1256
// Cost of a masked expandload / compressstore described by MICA: the cost of
// the underlying memory op plus the vector instructions that surround it.
// NOTE(review): original lines 1257 (return type and function name), 1259 (the
// rest of the signature, presumably declaring CostKind) and 1271 (the
// statement guarded by the legality / cost-kind check) are missing from this
// listing.
1258 const MemIntrinsicCostAttributes &MICA,
     // masked_expandload is a load; every other ID reaching this function is
     // treated as a store.
1260 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1261 ? Instruction::Load
1262 : Instruction::Store;
1263 Type *DataTy = MICA.getDataType();
1264 bool VariableMask = MICA.getVariableMask();
1265 Align Alignment = MICA.getAlignment();
1266 bool IsLegal = (Opcode == Instruction::Store &&
1267 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1268 (Opcode == Instruction::Load &&
1269 isLegalMaskedExpandLoad(DataTy, Alignment));
     // The sequence below is only used for legal operations under
     // TCK_RecipThroughput.
1270 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1272 // Example compressstore sequence:
1273 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1274 // vcompress.vm v10, v8, v0
1275 // vcpop.m a1, v0
1276 // vsetvli zero, a1, e32, m2, ta, ma
1277 // vse32.v v10, (a0)
1278 // Example expandload sequence:
1279 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1280 // vcpop.m a1, v0
1281 // vsetvli zero, a1, e32, m2, ta, ma
1282 // vle32.v v10, (a0)
1283 // vsetivli zero, 8, e32, m2, ta, ma
1284 // viota.m v12, v0
1285 // vrgather.vv v8, v10, v12, v0.t
1286 auto MemOpCost =
1287 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1288 auto LT = getTypeLegalizationCost(DataTy);
     // VSETVLI is always counted; VCPOP_M is added only for a variable mask.
1289 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1290 if (VariableMask)
1291 Opcodes.push_back(RISCV::VCPOP_M);
1292 if (Opcode == Instruction::Store)
1293 Opcodes.append({RISCV::VCOMPRESS_VM});
1294 else
1295 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1296 return MemOpCost +
1297 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1298}
1299
// Cost of a VP strided load/store described by MICA: the estimated number of
// element accesses times the cost of one scalar-element memory op.
// NOTE(review): the signature (original lines 1300-1302) and original lines
// 1313 and 1315 are missing from this listing. As a result the statement
// guarded by the legality check and the condition that guards
// `return TTI::TCC_Basic;` are not visible.
1303
     // experimental_vp_strided_load is a load; every other ID reaching this
     // function is treated as a store.
1304 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1305 ? Instruction::Load
1306 : Instruction::Store;
1307
1308 Type *DataTy = MICA.getDataType();
1309 Align Alignment = MICA.getAlignment();
1310 const Instruction *I = MICA.getInst();
1311
1312 if (!isLegalStridedLoadStore(DataTy, Alignment))
1314
1316 return TTI::TCC_Basic;
1317
1318 // Cost is proportional to the number of memory operations implied. For
1319 // scalable vectors, we use an estimate on that number since we don't
1320 // know exactly what VL will be.
1321 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1322 auto &VTy = *cast<VectorType>(DataTy);
1323 InstructionCost MemOpCost =
1324 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1325 {TTI::OK_AnyValue, TTI::OP_None}, I);
1326 unsigned NumLoads = getEstimatedVLFor(&VTy);
1327 return NumLoads * MemOpCost;
1328}
1329
// Accumulates, for every vector type in Tys, the cost of one store plus one
// load of that type at its preferred alignment; non-vector types add nothing.
// NOTE(review): the signature (original lines 1330-1331) and original lines
// 1335-1336 are missing from this listing, so the function name and the
// declarations of `Tys`, `CostKind` and `Cost` are not visible.
1332 // FIXME: This is a property of the default vector convention, not
1333 // all possible calling conventions. Fixing that will require
1334 // some TTI API and SLP rework.
1337 for (auto *Ty : Tys) {
1338 if (!Ty->isVectorTy())
1339 continue;
1340 Align A = DL.getPrefTypeAlign(Ty);
1341 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1342 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1343 }
1344 return Cost;
1345}
1346
1347// Currently, these represent both throughput and codesize costs
1348// for the respective intrinsics. The costs in this table are simply
1349// instruction counts with the following adjustments made:
1350// * One vsetvli is considered free.
// Each row lists an intrinsic ID, an element MVT and a cost value.
// NOTE(review): the declaration line of the table (original line 1351) is
// missing from this listing; the element type and table name are therefore not
// visible here.
1352 {Intrinsic::floor, MVT::f32, 9},
1353 {Intrinsic::floor, MVT::f64, 9},
1354 {Intrinsic::ceil, MVT::f32, 9},
1355 {Intrinsic::ceil, MVT::f64, 9},
1356 {Intrinsic::trunc, MVT::f32, 7},
1357 {Intrinsic::trunc, MVT::f64, 7},
1358 {Intrinsic::round, MVT::f32, 9},
1359 {Intrinsic::round, MVT::f64, 9},
1360 {Intrinsic::roundeven, MVT::f32, 9},
1361 {Intrinsic::roundeven, MVT::f64, 9},
1362 {Intrinsic::rint, MVT::f32, 7},
1363 {Intrinsic::rint, MVT::f64, 7},
1364 {Intrinsic::nearbyint, MVT::f32, 9},
1365 {Intrinsic::nearbyint, MVT::f64, 9},
1366 {Intrinsic::bswap, MVT::i16, 3},
1367 {Intrinsic::bswap, MVT::i32, 12},
1368 {Intrinsic::bswap, MVT::i64, 31},
1369 {Intrinsic::vp_bswap, MVT::i16, 3},
1370 {Intrinsic::vp_bswap, MVT::i32, 12},
1371 {Intrinsic::vp_bswap, MVT::i64, 31},
1372 {Intrinsic::vp_fshl, MVT::i8, 7},
1373 {Intrinsic::vp_fshl, MVT::i16, 7},
1374 {Intrinsic::vp_fshl, MVT::i32, 7},
1375 {Intrinsic::vp_fshl, MVT::i64, 7},
1376 {Intrinsic::vp_fshr, MVT::i8, 7},
1377 {Intrinsic::vp_fshr, MVT::i16, 7},
1378 {Intrinsic::vp_fshr, MVT::i32, 7},
1379 {Intrinsic::vp_fshr, MVT::i64, 7},
1380 {Intrinsic::bitreverse, MVT::i8, 17},
1381 {Intrinsic::bitreverse, MVT::i16, 24},
1382 {Intrinsic::bitreverse, MVT::i32, 33},
1383 {Intrinsic::bitreverse, MVT::i64, 52},
1384 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1385 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1386 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1387 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1388 {Intrinsic::ctpop, MVT::i8, 12},
1389 {Intrinsic::ctpop, MVT::i16, 19},
1390 {Intrinsic::ctpop, MVT::i32, 20},
1391 {Intrinsic::ctpop, MVT::i64, 21},
1392 {Intrinsic::ctlz, MVT::i8, 19},
1393 {Intrinsic::ctlz, MVT::i16, 28},
1394 {Intrinsic::ctlz, MVT::i32, 31},
1395 {Intrinsic::ctlz, MVT::i64, 35},
1396 {Intrinsic::cttz, MVT::i8, 16},
1397 {Intrinsic::cttz, MVT::i16, 23},
1398 {Intrinsic::cttz, MVT::i32, 24},
1399 {Intrinsic::cttz, MVT::i64, 25},
1400 {Intrinsic::vp_ctpop, MVT::i8, 12},
1401 {Intrinsic::vp_ctpop, MVT::i16, 19},
1402 {Intrinsic::vp_ctpop, MVT::i32, 20},
1403 {Intrinsic::vp_ctpop, MVT::i64, 21},
1404 {Intrinsic::vp_ctlz, MVT::i8, 19},
1405 {Intrinsic::vp_ctlz, MVT::i16, 28},
1406 {Intrinsic::vp_ctlz, MVT::i32, 31},
1407 {Intrinsic::vp_ctlz, MVT::i64, 35},
1408 {Intrinsic::vp_cttz, MVT::i8, 16},
1409 {Intrinsic::vp_cttz, MVT::i16, 23},
1410 {Intrinsic::vp_cttz, MVT::i32, 24},
1411 {Intrinsic::vp_cttz, MVT::i64, 25},
1412};
1413
// Cost of the intrinsic call described by ICA. A switch handles a set of
// intrinsic IDs explicitly; cases that `break` continue to a lookup in
// VectorIntrinsicCostTable keyed by the intrinsic ID and the legalized element
// type of the return type.
// NOTE(review): this listing is missing the signature (original lines
// 1414-1416) and original lines 1427, 1432, 1440, 1560-1561, 1719, 1721, 1729,
// 1731, 1735, 1745, 1756, 1758, 1795, 1797, 1807 and 1825. Among other things
// the function name, the declarations of `CostKind`, `Ops`, `ConvOp`,
// `FsqrtOp` and `Cost`, several early-return statements and the final return
// are therefore not visible; restore them from upstream before building.
1417 auto *RetTy = ICA.getReturnType();
1418 switch (ICA.getID()) {
1419 case Intrinsic::lrint:
1420 case Intrinsic::llrint:
1421 case Intrinsic::lround:
1422 case Intrinsic::llround: {
1423 auto LT = getTypeLegalizationCost(RetTy);
1424 Type *SrcTy = ICA.getArgTypes().front();
1425 auto SrcLT = getTypeLegalizationCost(SrcTy);
1426 if (ST->hasVInstructions() && LT.second.isVector()) {
1428 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1429 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
     // Pick the conversion sequence from the element type and from the
     // relative source/destination element widths.
1430 if (LT.second.getVectorElementType() == MVT::bf16) {
1431 if (!ST->hasVInstructionsBF16Minimal())
1433 if (DstEltSz == 32)
1434 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1435 else
1436 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1437 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1438 !ST->hasVInstructionsF16()) {
1439 if (!ST->hasVInstructionsF16Minimal())
1441 if (DstEltSz == 32)
1442 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1443 else
1444 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1445
1446 } else if (SrcEltSz > DstEltSz) {
1447 Ops = {RISCV::VFNCVT_X_F_W};
1448 } else if (SrcEltSz < DstEltSz) {
1449 Ops = {RISCV::VFWCVT_X_F_V};
1450 } else {
1451 Ops = {RISCV::VFCVT_X_F_V};
1452 }
1453
1454 // We need to use the source LMUL in the case of a narrowing op, and the
1455 // destination LMUL otherwise.
1456 if (SrcEltSz > DstEltSz)
1457 return SrcLT.first *
1458 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1459 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1460 }
1461 break;
1462 }
1463 case Intrinsic::ceil:
1464 case Intrinsic::floor:
1465 case Intrinsic::trunc:
1466 case Intrinsic::rint:
1467 case Intrinsic::round:
1468 case Intrinsic::roundeven: {
1469 // These all use the same code.
1470 auto LT = getTypeLegalizationCost(RetTy);
1471 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1472 return LT.first * 8;
1473 break;
1474 }
1475 case Intrinsic::umin:
1476 case Intrinsic::umax:
1477 case Intrinsic::smin:
1478 case Intrinsic::smax: {
1479 auto LT = getTypeLegalizationCost(RetTy);
1480 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1481 return LT.first;
1482
1483 if (ST->hasVInstructions() && LT.second.isVector()) {
1484 unsigned Op;
1485 switch (ICA.getID()) {
1486 case Intrinsic::umin:
1487 Op = RISCV::VMINU_VV;
1488 break;
1489 case Intrinsic::umax:
1490 Op = RISCV::VMAXU_VV;
1491 break;
1492 case Intrinsic::smin:
1493 Op = RISCV::VMIN_VV;
1494 break;
1495 case Intrinsic::smax:
1496 Op = RISCV::VMAX_VV;
1497 break;
1498 }
1499 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1500 }
1501 break;
1502 }
1503 case Intrinsic::sadd_sat:
1504 case Intrinsic::ssub_sat:
1505 case Intrinsic::uadd_sat:
1506 case Intrinsic::usub_sat: {
1507 auto LT = getTypeLegalizationCost(RetTy);
1508 if (ST->hasVInstructions() && LT.second.isVector()) {
1509 unsigned Op;
1510 switch (ICA.getID()) {
1511 case Intrinsic::sadd_sat:
1512 Op = RISCV::VSADD_VV;
1513 break;
1514 case Intrinsic::ssub_sat:
1515 Op = RISCV::VSSUB_VV;
1516 break;
1517 case Intrinsic::uadd_sat:
1518 Op = RISCV::VSADDU_VV;
1519 break;
1520 case Intrinsic::usub_sat:
1521 Op = RISCV::VSSUBU_VV;
1522 break;
1523 }
1524 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1525 }
1526 break;
1527 }
1528 case Intrinsic::fma:
1529 case Intrinsic::fmuladd: {
1530 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1531 auto LT = getTypeLegalizationCost(RetTy);
1532 if (ST->hasVInstructions() && LT.second.isVector())
1533 return LT.first *
1534 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1535 break;
1536 }
1537 case Intrinsic::fabs: {
1538 auto LT = getTypeLegalizationCost(RetTy);
1539 if (ST->hasVInstructions() && LT.second.isVector()) {
1540 // lui a0, 8
1541 // addi a0, a0, -1
1542 // vsetvli a1, zero, e16, m1, ta, ma
1543 // vand.vx v8, v8, a0
1544 // f16 with zvfhmin and bf16 with zvfbfmin
1545 if (LT.second.getVectorElementType() == MVT::bf16 ||
1546 (LT.second.getVectorElementType() == MVT::f16 &&
1547 !ST->hasVInstructionsF16()))
1548 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1549 CostKind) +
1550 2;
1551 else
1552 return LT.first *
1553 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1554 }
1555 break;
1556 }
1557 case Intrinsic::sqrt: {
1558 auto LT = getTypeLegalizationCost(RetTy);
1559 if (ST->hasVInstructions() && LT.second.isVector()) {
1562 MVT ConvType = LT.second;
1563 MVT FsqrtType = LT.second;
1564 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1565 // will be split.
1566 if (LT.second.getVectorElementType() == MVT::bf16) {
1567 if (LT.second == MVT::nxv32bf16) {
1568 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1569 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1570 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1571 ConvType = MVT::nxv16f16;
1572 FsqrtType = MVT::nxv16f32;
1573 } else {
1574 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1575 FsqrtOp = {RISCV::VFSQRT_V};
1576 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1577 }
1578 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1579 !ST->hasVInstructionsF16()) {
1580 if (LT.second == MVT::nxv32f16) {
1581 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1582 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1583 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1584 ConvType = MVT::nxv16f16;
1585 FsqrtType = MVT::nxv16f32;
1586 } else {
1587 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1588 FsqrtOp = {RISCV::VFSQRT_V};
1589 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1590 }
1591 } else {
1592 FsqrtOp = {RISCV::VFSQRT_V};
1593 }
1594
1595 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1596 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1597 }
1598 break;
1599 }
1600 case Intrinsic::cttz:
1601 case Intrinsic::ctlz:
1602 case Intrinsic::ctpop: {
1603 auto LT = getTypeLegalizationCost(RetTy);
1604 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1605 unsigned Op;
1606 switch (ICA.getID()) {
1607 case Intrinsic::cttz:
1608 Op = RISCV::VCTZ_V;
1609 break;
1610 case Intrinsic::ctlz:
1611 Op = RISCV::VCLZ_V;
1612 break;
1613 case Intrinsic::ctpop:
1614 Op = RISCV::VCPOP_V;
1615 break;
1616 }
1617 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1618 }
1619 break;
1620 }
1621 case Intrinsic::abs: {
1622 auto LT = getTypeLegalizationCost(RetTy);
1623 if (ST->hasVInstructions() && LT.second.isVector()) {
1624 // vabs.v v10, v8
1625 if (ST->hasStdExtZvabd())
1626 return LT.first *
1627 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
1628
1629 // vrsub.vi v10, v8, 0
1630 // vmax.vv v8, v8, v10
1631 return LT.first *
1632 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1633 LT.second, CostKind);
1634 }
1635 break;
1636 }
1637 case Intrinsic::fshl:
1638 case Intrinsic::fshr: {
     // Type-only queries carry no operands, so the rotate check below cannot
     // be made.
1639 if (ICA.getArgs().empty())
1640 break;
1641
1642 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1643 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1644 // instruction.
1645 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1646 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1647 (RetTy->getIntegerBitWidth() == 32 ||
1648 RetTy->getIntegerBitWidth() == 64) &&
1649 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1650 return 1;
1651 }
1652 break;
1653 }
     // Masked division/remainder intrinsics are costed like the corresponding
     // plain arithmetic instruction on the return type.
1654 case Intrinsic::masked_udiv:
1655 return getArithmeticInstrCost(Instruction::UDiv, ICA.getReturnType(),
1656 CostKind);
1657 case Intrinsic::masked_sdiv:
1658 return getArithmeticInstrCost(Instruction::SDiv, ICA.getReturnType(),
1659 CostKind);
1660 case Intrinsic::masked_urem:
1661 return getArithmeticInstrCost(Instruction::URem, ICA.getReturnType(),
1662 CostKind);
1663 case Intrinsic::masked_srem:
1664 return getArithmeticInstrCost(Instruction::SRem, ICA.getReturnType(),
1665 CostKind);
1666 case Intrinsic::get_active_lane_mask: {
1667 if (ST->hasVInstructions()) {
     // Cost on a vector of the first argument's type with the element count
     // of the returned mask.
1668 Type *ExpRetTy = VectorType::get(
1669 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1670 auto LT = getTypeLegalizationCost(ExpRetTy);
1671
1672 // vid.v v8 // considered hoisted
1673 // vsaddu.vx v8, v8, a0
1674 // vmsltu.vx v0, v8, a1
1675 return LT.first *
1676 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1677 LT.second, CostKind);
1678 }
1679 break;
1680 }
1681 // TODO: add more intrinsics
1682 case Intrinsic::stepvector: {
1683 auto LT = getTypeLegalizationCost(RetTy);
1684 // Legalisation of illegal types involves an `index' instruction plus
1685 // (LT.first - 1) vector adds.
1686 if (ST->hasVInstructions())
1687 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1688 (LT.first - 1) *
1689 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1690 return 1 + (LT.first - 1);
1691 }
1692 case Intrinsic::vector_splice_left:
1693 case Intrinsic::vector_splice_right: {
1694 auto LT = getTypeLegalizationCost(RetTy);
1695 // Constant offsets fall through to getShuffleCost.
1696 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
1697 break;
1698 if (ST->hasVInstructions() && LT.second.isVector()) {
1699 return LT.first *
1700 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1701 LT.second, CostKind);
1702 }
1703 break;
1704 }
1705 case Intrinsic::experimental_cttz_elts: {
1706 Type *ArgTy = ICA.getArgTypes()[0];
1707 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1708 if (getTLI()->shouldExpandCttzElements(ArgType))
1709 break;
1710 InstructionCost Cost = getRISCVInstructionCost(
1711 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1712
1713 // If zero_is_poison is false, then we will generate additional
1714 // cmp + select instructions to convert -1 to EVL.
1715 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
     // NOTE(review): the trailing arguments of both getCmpSelInstrCost calls
     // below (original lines 1719 and 1721) are missing from this listing.
1716 if (ICA.getArgs().size() > 1 &&
1717 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1718 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1720 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1722
1723 return Cost;
1724 }
1725 case Intrinsic::experimental_vp_splice: {
1726 // To support type-based query from vectorizer, set the index to 0.
1727 // Note that the index only changes the cost from vslide.vx to vslide.vi and
1728 // in current implementations they have the same cost.
     // NOTE(review): the start and end of the return statement (original
     // lines 1729 and 1731) are missing; only its middle line survives.
1730 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1732 }
1733 case Intrinsic::fptoui_sat:
1734 case Intrinsic::fptosi_sat: {
1736 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1737 Type *SrcTy = ICA.getArgTypes()[0];
1738
1739 auto SrcLT = getTypeLegalizationCost(SrcTy);
1740 auto DstLT = getTypeLegalizationCost(RetTy);
1741 if (!SrcTy->isVectorTy())
1742 break;
1743
1744 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1746
     // Base cost: the plain FP-to-integer conversion.
1747 Cost +=
1748 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1749 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1750
1751 // Handle NaN.
1752 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1753 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1754 Type *CondTy = RetTy->getWithNewBitWidth(1);
1755 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1757 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1759 return Cost;
1760 }
1761 case Intrinsic::experimental_vector_extract_last_active: {
1762 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
1763 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
1764
1765 auto ValLT = getTypeLegalizationCost(ValTy);
1766 auto MaskLT = getTypeLegalizationCost(MaskTy);
1767
1768 // TODO: Return cheaper cost when the entire lane is inactive.
1769 // The expected asm sequence is:
1770 // vcpop.m a0, v0
1771 // beqz a0, exit # Return passthru when the entire lane is inactive.
1772 // vid v10, v0.t
1773 // vredmaxu.vs v10, v10, v10
1774 // vmv.x.s a0, v10
1775 // zext.b a0, a0
1776 // vslidedown.vx v8, v8, a0
1777 // vmv.x.s a0, v8
1778 // exit:
1779 // ...
1780
1781 // Find a suitable type for a stepvector.
1782 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1783 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1784 TLI->getVectorIdxTy(getDataLayout()), MaskTy->getElementCount(),
1785 /*ZeroIsPoison=*/true, &VScaleRange);
1786 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
1787 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
1788 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
1789 auto StepLT = getTypeLegalizationCost(StepVecTy);
1790
1791 // Currently expandVectorFindLastActive cannot handle step vector split.
1792 // So return invalid when the type needs split.
1793 // FIXME: Remove this if expandVectorFindLastActive supports split vector.
1794 if (StepLT.first > 1)
1796
1798 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1799
     // Sum the pieces of the sequence above: mask popcount, the branch, the
     // step-vector reduction, the zero-extend, and the final slide + move.
1800 Cost += MaskLT.first *
1801 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
1802 Cost += getCFInstrCost(Instruction::CondBr, CostKind, nullptr);
1803 Cost += StepLT.first *
1804 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
1805 Cost += getCastInstrCost(Instruction::ZExt,
1806 Type::getInt64Ty(ValTy->getContext()), StepTy,
     // NOTE(review): the sequence comment above shows vslidedown.vx, but the
     // opcode costed here is VSLIDEDOWN_VI -- confirm which is intended.
1808 Cost += ValLT.first *
1809 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1810 ValLT.second, CostKind);
1811 return Cost;
1812 }
1813 }
1814
     // Table fallback for intrinsics the switch did not resolve: the table
     // cost is scaled by the legalization factor LT.first.
1815 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1816 if (auto LT = getTypeLegalizationCost(RetTy);
1817 LT.second.isVector()) {
1818 MVT EltTy = LT.second.getVectorElementType();
1819 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1820 ICA.getID(), EltTy))
1821 return LT.first * Entry->Cost;
1822 }
1823 }
1824
1826}
1827
// Cost of computing an address of type PtrTy. With vector instructions and a
// vector PtrTy, the cost is that of one vector Add on PtrTy; every other case
// defers to BaseT::getAddressComputationCost.
// NOTE(review): original lines 1828-1829 and 1831 of the signature are missing
// from this listing, so the declarations of `PtrTy`, `SE` and `CostKind` are
// not visible here.
1830 const SCEV *Ptr,
1832 // Address computations for vector indexed load/store likely require an offset
1833 // and/or scaling.
1834 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1835 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1836
1837 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1838}
1839
1841 Type *Src,
1844 const Instruction *I) const {
1845 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1846 if (!IsVectorType)
1847 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1848
1849 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1850 // For now, skip all fixed vector cost analysis when P extension is available
1851 // to avoid crashes in getMinRVVVectorSizeInBits()
1852 if (ST->hasStdExtP() &&
1854 return 1; // Treat as single instruction cost for now
1855 }
1856
1857 // FIXME: Need to compute legalizing cost for illegal types. The current
1858 // code handles only legal types and those which can be trivially
1859 // promoted to legal.
1860 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1861 Dst->getScalarSizeInBits() > ST->getELen())
1862 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1863
1864 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1865 assert(ISD && "Invalid opcode");
1866 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1867 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1868
1869 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1870 // The shared implementation doesn't model vector widening during legalization
1871 // and instead assumes scalarization. In order to scalarize an <N x i1>
1872 // vector, we need to extend/trunc to/from i8. If we don't special case
1873 // this, we can get an infinite recursion cycle.
1874 switch (ISD) {
1875 default:
1876 break;
1877 case ISD::SIGN_EXTEND:
1878 case ISD::ZERO_EXTEND:
1879 if (Src->getScalarSizeInBits() == 1) {
1880 // We do not use vsext/vzext to extend from mask vector.
1881 // Instead we use the following instructions to extend from mask vector:
1882 // vmv.v.i v8, 0
1883 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1884 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1885 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1886 DstLT.second, CostKind) +
1887 DstLT.first - 1;
1888 }
1889 break;
1890 case ISD::TRUNCATE:
1891 if (Dst->getScalarSizeInBits() == 1) {
1892 // We do not use several vncvt to truncate to mask vector. So we could
1893 // not use PowDiff to calculate it.
1894 // Instead we use the following instructions to truncate to mask vector:
1895 // vand.vi v8, v8, 1
1896 // vmsne.vi v0, v8, 0
1897 return SrcLT.first *
1898 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1899 SrcLT.second, CostKind) +
1900 SrcLT.first - 1;
1901 }
1902 break;
1903 };
1904
1905 // Our actual lowering for the case where a wider legal type is available
1906 // uses promotion to the wider type. This is reflected in the result of
1907 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1908 // scalarized if the legalized Src and Dst are not equal sized.
1909 const DataLayout &DL = this->getDataLayout();
1910 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1911 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1912 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1913 SrcLT.second.getSizeInBits()) ||
1914 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1915 DstLT.second.getSizeInBits()) ||
1916 SrcLT.first > 1 || DstLT.first > 1)
1917 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1918
1919 // The split cost is handled by the base getCastInstrCost
1920 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1921
1922 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1923 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1924 switch (ISD) {
1925 case ISD::SIGN_EXTEND:
1926 case ISD::ZERO_EXTEND: {
1927 if ((PowDiff < 1) || (PowDiff > 3))
1928 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1929 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1930 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1931 unsigned Op =
1932 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1933 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1934 }
1935 case ISD::TRUNCATE:
1936 case ISD::FP_EXTEND:
1937 case ISD::FP_ROUND: {
1938 // Counts of narrow/widen instructions.
1939 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1940 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1941
1942 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1943 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1944 : RISCV::VFNCVT_F_F_W;
1946 for (; SrcEltSize != DstEltSize;) {
1947 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1948 ? MVT::getIntegerVT(DstEltSize)
1949 : MVT::getFloatingPointVT(DstEltSize);
1950 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1951 DstEltSize =
1952 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1953 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1954 }
1955 return Cost;
1956 }
1957 case ISD::FP_TO_SINT:
1958 case ISD::FP_TO_UINT: {
1959 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1960 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1961 unsigned FWCVT =
1962 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1963 unsigned FNCVT =
1964 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1965 unsigned SrcEltSize = Src->getScalarSizeInBits();
1966 unsigned DstEltSize = Dst->getScalarSizeInBits();
1968 if ((SrcEltSize == 16) &&
1969 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1970 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1971 // pre-widening to f32 and then convert f32 to integer
1972 VectorType *VecF32Ty =
1973 VectorType::get(Type::getFloatTy(Dst->getContext()),
1974 cast<VectorType>(Dst)->getElementCount());
1975 std::pair<InstructionCost, MVT> VecF32LT =
1976 getTypeLegalizationCost(VecF32Ty);
1977 Cost +=
1978 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1979 VecF32LT.second, CostKind);
1980 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1981 return Cost;
1982 }
1983 if (DstEltSize == SrcEltSize)
1984 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1985 else if (DstEltSize > SrcEltSize)
1986 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1987 else { // (SrcEltSize > DstEltSize)
1988 // First do a narrowing conversion to an integer half the size, then
1989 // truncate if needed.
1990 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1991 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1992 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1993 if ((SrcEltSize / 2) > DstEltSize) {
1994 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1995 Cost +=
1996 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1997 }
1998 }
1999 return Cost;
2000 }
2001 case ISD::SINT_TO_FP:
2002 case ISD::UINT_TO_FP: {
2003 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
2004 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
2005 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
2006 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
2007 unsigned SrcEltSize = Src->getScalarSizeInBits();
2008 unsigned DstEltSize = Dst->getScalarSizeInBits();
2009
2011 if ((DstEltSize == 16) &&
2012 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
2013 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
2014 // it is converted to f32 and then converted to f16
2015 VectorType *VecF32Ty =
2016 VectorType::get(Type::getFloatTy(Dst->getContext()),
2017 cast<VectorType>(Dst)->getElementCount());
2018 std::pair<InstructionCost, MVT> VecF32LT =
2019 getTypeLegalizationCost(VecF32Ty);
2020 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
2021 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
2022 DstLT.second, CostKind);
2023 return Cost;
2024 }
2025
2026 if (DstEltSize == SrcEltSize)
2027 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
2028 else if (DstEltSize > SrcEltSize) {
2029 if ((DstEltSize / 2) > SrcEltSize) {
2030 VectorType *VecTy =
2031 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
2032 cast<VectorType>(Dst)->getElementCount());
2033 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
2034 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
2035 }
2036 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
2037 } else
2038 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
2039 return Cost;
2040 }
2041 }
2042 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
2043}
2044
2045unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
2046 if (isa<ScalableVectorType>(Ty)) {
2047 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
2048 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
2049 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
2050 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
2051 }
2052 return cast<FixedVectorType>(Ty)->getNumElements();
2053}
2054
2057 FastMathFlags FMF,
2059 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2060 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2061
2062 // Skip if scalar size of Ty is bigger than ELEN.
2063 if (Ty->getScalarSizeInBits() > ST->getELen())
2064 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2065
2066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2067 if (Ty->getElementType()->isIntegerTy(1)) {
2068 // SelectionDAGBuilder does following transforms:
2069 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2070 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2071 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2072 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2073 else
2074 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2075 }
2076
2077 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2079 InstructionCost ExtraCost = 0;
2080 switch (IID) {
2081 case Intrinsic::maximum:
2082 if (FMF.noNaNs()) {
2083 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2084 } else {
2085 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2086 RISCV::VFMV_F_S};
2087 // Cost of Canonical Nan + branch
2088 // lui a0, 523264
2089 // fmv.w.x fa0, a0
2090 Type *DstTy = Ty->getScalarType();
2091 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2092 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2093 ExtraCost = 1 +
2094 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2096 getCFInstrCost(Instruction::CondBr, CostKind);
2097 }
2098 break;
2099
2100 case Intrinsic::minimum:
2101 if (FMF.noNaNs()) {
2102 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2103 } else {
2104 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2105 RISCV::VFMV_F_S};
2106 // Cost of Canonical Nan + branch
2107 // lui a0, 523264
2108 // fmv.w.x fa0, a0
2109 Type *DstTy = Ty->getScalarType();
2110 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2111 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2112 ExtraCost = 1 +
2113 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2115 getCFInstrCost(Instruction::CondBr, CostKind);
2116 }
2117 break;
2118 }
2119 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2120 }
2121
2122 // IR Reduction is composed by one rvv reduction instruction and vmv
2123 unsigned SplitOp;
2125 switch (IID) {
2126 default:
2127 llvm_unreachable("Unsupported intrinsic");
2128 case Intrinsic::smax:
2129 SplitOp = RISCV::VMAX_VV;
2130 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2131 break;
2132 case Intrinsic::smin:
2133 SplitOp = RISCV::VMIN_VV;
2134 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2135 break;
2136 case Intrinsic::umax:
2137 SplitOp = RISCV::VMAXU_VV;
2138 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2139 break;
2140 case Intrinsic::umin:
2141 SplitOp = RISCV::VMINU_VV;
2142 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2143 break;
2144 case Intrinsic::maxnum:
2145 SplitOp = RISCV::VFMAX_VV;
2146 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2147 break;
2148 case Intrinsic::minnum:
2149 SplitOp = RISCV::VFMIN_VV;
2150 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2151 break;
2152 }
2153 // Add a cost for data larger than LMUL8
2154 InstructionCost SplitCost =
2155 (LT.first > 1) ? (LT.first - 1) *
2156 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2157 : 0;
2158 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2159}
2160
2163 std::optional<FastMathFlags> FMF,
2165 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2166 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2167
2168 // Skip if scalar size of Ty is bigger than ELEN.
2169 if (Ty->getScalarSizeInBits() > ST->getELen())
2170 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2171
2172 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2173 assert(ISD && "Invalid opcode");
2174
2175 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2176 ISD != ISD::FADD)
2177 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2178
2179 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2180 Type *ElementTy = Ty->getElementType();
2181 if (ElementTy->isIntegerTy(1)) {
2182 // Example sequences:
2183 // vfirst.m a0, v0
2184 // seqz a0, a0
2185 if (LT.second == MVT::v1i1)
2186 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2187 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2189
2190 if (ISD == ISD::AND) {
2191 // Example sequences:
2192 // vmand.mm v8, v9, v8 ; needed every time type is split
2193 // vmnot.m v8, v0 ; alias for vmnand
2194 // vcpop.m a0, v8
2195 // seqz a0, a0
2196
2197 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2198 // For LMUL <= 8, there is no splitting,
2199 // the sequences are vmnot, vcpop and seqz.
2200 // When LMUL > 8 and split = 1,
2201 // the sequences are vmnand, vcpop and seqz.
2202 // When LMUL > 8 and split > 1,
2203 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2204 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2205 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2206 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2207 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2208 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2210 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2211 // Example sequences:
2212 // vsetvli a0, zero, e8, mf8, ta, ma
2213 // vmxor.mm v8, v0, v8 ; needed every time type is split
2214 // vcpop.m a0, v8
2215 // andi a0, a0, 1
2216 return (LT.first - 1) *
2217 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2218 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2219 } else {
2220 assert(ISD == ISD::OR);
2221 // Example sequences:
2222 // vsetvli a0, zero, e8, mf8, ta, ma
2223 // vmor.mm v8, v9, v8 ; needed every time type is split
2224 // vcpop.m a0, v0
2225 // snez a0, a0
2226 return (LT.first - 1) *
2227 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2228 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2229 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2231 }
2232 }
2233
2234 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2235 // instruction, and others is composed by two vmv and one rvv reduction
2236 // instruction
2237 unsigned SplitOp;
2239 switch (ISD) {
2240 case ISD::ADD:
2241 SplitOp = RISCV::VADD_VV;
2242 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2243 break;
2244 case ISD::OR:
2245 SplitOp = RISCV::VOR_VV;
2246 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2247 break;
2248 case ISD::XOR:
2249 SplitOp = RISCV::VXOR_VV;
2250 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2251 break;
2252 case ISD::AND:
2253 SplitOp = RISCV::VAND_VV;
2254 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2255 break;
2256 case ISD::FADD:
2257 // We can't promote f16/bf16 fadd reductions.
2258 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2259 LT.second.getScalarType() == MVT::bf16)
2260 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2262 Opcodes.push_back(RISCV::VFMV_S_F);
2263 for (unsigned i = 0; i < LT.first.getValue(); i++)
2264 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2265 Opcodes.push_back(RISCV::VFMV_F_S);
2266 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2267 }
2268 SplitOp = RISCV::VFADD_VV;
2269 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2270 break;
2271 }
2272 // Add a cost for data larger than LMUL8
2273 InstructionCost SplitCost =
2274 (LT.first > 1) ? (LT.first - 1) *
2275 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2276 : 0;
2277 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2278}
2279
2281 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2282 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2283 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2284 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2285 FMF, CostKind);
2286
2287 // Skip if scalar size of ResTy is bigger than ELEN.
2288 if (ResTy->getScalarSizeInBits() > ST->getELen())
2289 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2290 FMF, CostKind);
2291
2292 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2293 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2294 FMF, CostKind);
2295
2296 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2297
2298 if (IsUnsigned && Opcode == Instruction::Add &&
2299 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2300 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2301 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2302 return LT.first *
2303 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2304 }
2305
2306 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2307 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2308 FMF, CostKind);
2309
2310 return (LT.first - 1) +
2311 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2312}
2313
2317 assert(OpInfo.isConstant() && "non constant operand?");
2318 if (!isa<VectorType>(Ty))
2319 // FIXME: We need to account for immediate materialization here, but doing
2320 // a decent job requires more knowledge about the immediate than we
2321 // currently have here.
2322 return 0;
2323
2324 if (OpInfo.isUniform())
2325 // vmv.v.i, vmv.v.x, or vfmv.v.f
2326 // We ignore the cost of the scalar constant materialization to be consistent
2327 // with how we treat scalar constants themselves just above.
2328 return 1;
2329
2330 return getConstantPoolLoadCost(Ty, CostKind);
2331}
2332
2334 Align Alignment,
2335 unsigned AddressSpace,
2337 TTI::OperandValueInfo OpInfo,
2338 const Instruction *I) const {
2339 EVT VT = TLI->getValueType(DL, Src, true);
2340 // Type legalization can't handle structs, and load latency isn't handled here
2341 if (VT == MVT::Other ||
2342 (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency))
2343 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2344 CostKind, OpInfo, I);
2345
2347 if (Opcode == Instruction::Store && OpInfo.isConstant())
2348 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2349
2350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2351
2352 InstructionCost BaseCost = [&]() {
2353 InstructionCost Cost = LT.first;
2355 return Cost;
2356
2357 // Our actual lowering for the case where a wider legal type is available
2358 // uses the a VL predicated load on the wider type. This is reflected in
2359 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2360 // widened cases are scalarized.
2361 const DataLayout &DL = this->getDataLayout();
2362 if (Src->isVectorTy() && LT.second.isVector() &&
2363 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2364 LT.second.getSizeInBits()))
2365 return Cost;
2366
2367 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2368 CostKind, OpInfo, I);
2369 }();
2370
2371 // Assume memory ops cost scale with the number of vector registers
2372 // possible accessed by the instruction. Note that BasicTTI already
2373 // handles the LT.first term for us.
2374 if (ST->hasVInstructions() && LT.second.isVector() &&
2376 BaseCost *= TLI->getLMULCost(LT.second);
2377 return Cost + BaseCost;
2378}
2379
2381 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2383 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2385 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2386 Op1Info, Op2Info, I);
2387
2388 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2389 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2390 Op1Info, Op2Info, I);
2391
2392 // Skip if scalar size of ValTy is bigger than ELEN.
2393 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2394 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2395 Op1Info, Op2Info, I);
2396
2397 auto GetConstantMatCost =
2398 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2399 if (OpInfo.isUniform())
2400 // We return 0 we currently ignore the cost of materializing scalar
2401 // constants in GPRs.
2402 return 0;
2403
2404 return getConstantPoolLoadCost(ValTy, CostKind);
2405 };
2406
2407 InstructionCost ConstantMatCost;
2408 if (Op1Info.isConstant())
2409 ConstantMatCost += GetConstantMatCost(Op1Info);
2410 if (Op2Info.isConstant())
2411 ConstantMatCost += GetConstantMatCost(Op2Info);
2412
2413 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2414 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2415 if (CondTy->isVectorTy()) {
2416 if (ValTy->getScalarSizeInBits() == 1) {
2417 // vmandn.mm v8, v8, v9
2418 // vmand.mm v9, v0, v9
2419 // vmor.mm v0, v9, v8
2420 return ConstantMatCost +
2421 LT.first *
2422 getRISCVInstructionCost(
2423 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2424 LT.second, CostKind);
2425 }
2426 // vselect and max/min are supported natively.
2427 return ConstantMatCost +
2428 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2429 CostKind);
2430 }
2431
2432 if (ValTy->getScalarSizeInBits() == 1) {
2433 // vmv.v.x v9, a0
2434 // vmsne.vi v9, v9, 0
2435 // vmandn.mm v8, v8, v9
2436 // vmand.mm v9, v0, v9
2437 // vmor.mm v0, v9, v8
2438 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2439 return ConstantMatCost +
2440 LT.first *
2441 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2442 InterimVT, CostKind) +
2443 LT.first * getRISCVInstructionCost(
2444 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2445 LT.second, CostKind);
2446 }
2447
2448 // vmv.v.x v10, a0
2449 // vmsne.vi v0, v10, 0
2450 // vmerge.vvm v8, v9, v8, v0
2451 return ConstantMatCost +
2452 LT.first * getRISCVInstructionCost(
2453 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2454 LT.second, CostKind);
2455 }
2456
2457 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2458 CmpInst::isIntPredicate(VecPred)) {
2459 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2460 // provided they incur the same cost across all implementations
2461 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2462 LT.second,
2463 CostKind);
2464 }
2465
2466 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2467 CmpInst::isFPPredicate(VecPred)) {
2468
2469 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2470 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2471 return ConstantMatCost +
2472 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2473
2474 // If we do not support the input floating point vector type, use the base
2475 // one which will calculate as:
2476 // ScalarizeCost + Num * Cost for fixed vector,
2477 // InvalidCost for scalable vector.
2478 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2479 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2480 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2481 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2482 Op1Info, Op2Info, I);
2483
2484 // Assuming vector fp compare and mask instructions are all the same cost
2485 // until a need arises to differentiate them.
2486 switch (VecPred) {
2487 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2488 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2489 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2490 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2491 return ConstantMatCost +
2492 LT.first * getRISCVInstructionCost(
2493 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2494 LT.second, CostKind);
2495
2496 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2497 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2498 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2499 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2500 return ConstantMatCost +
2501 LT.first *
2502 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2503 LT.second, CostKind);
2504
2505 case CmpInst::FCMP_OEQ: // vmfeq.vv
2506 case CmpInst::FCMP_OGT: // vmflt.vv
2507 case CmpInst::FCMP_OGE: // vmfle.vv
2508 case CmpInst::FCMP_OLT: // vmflt.vv
2509 case CmpInst::FCMP_OLE: // vmfle.vv
2510 case CmpInst::FCMP_UNE: // vmfne.vv
2511 return ConstantMatCost +
2512 LT.first *
2513 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2514 default:
2515 break;
2516 }
2517 }
2518
2519 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2520 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2521 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2522 // be (0 + select instr cost).
2523 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2524 ValTy->isIntegerTy() && !I->user_empty()) {
2525 if (all_of(I->users(), [&](const User *U) {
2526 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2527 U->getType()->isIntegerTy() &&
2528 !isa<ConstantData>(U->getOperand(1)) &&
2529 !isa<ConstantData>(U->getOperand(2));
2530 }))
2531 return 0;
2532 }
2533
2534 // TODO: Add cost for scalar type.
2535
2536 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2537 Op1Info, Op2Info, I);
2538}
2539
2542 const Instruction *I) const {
2544 return Opcode == Instruction::PHI ? 0 : 1;
2545 // Branches are assumed to be predicted.
2546 return 0;
2547}
2548
2550 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2551 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2552 assert(Val->isVectorTy() && "This must be a vector type");
2553
2554 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2555 // For now, skip all fixed vector cost analysis when P extension is available
2556 // to avoid crashes in getMinRVVVectorSizeInBits()
2557 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2558 return 1; // Treat as single instruction cost for now
2559 }
2560
2561 if (Opcode != Instruction::ExtractElement &&
2562 Opcode != Instruction::InsertElement)
2563 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2564 VIC);
2565
2566 // Legalize the type.
2567 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2568
2569 // This type is legalized to a scalar type.
2570 if (!LT.second.isVector()) {
2571 auto *FixedVecTy = cast<FixedVectorType>(Val);
2572 // If Index is a known constant, cost is zero.
2573 if (Index != -1U)
2574 return 0;
2575 // Extract/InsertElement with non-constant index is very costly when
2576 // scalarized; estimate cost of loads/stores sequence via the stack:
2577 // ExtractElement cost: store vector to stack, load scalar;
2578 // InsertElement cost: store vector to stack, store scalar, load vector.
2579 Type *ElemTy = FixedVecTy->getElementType();
2580 auto NumElems = FixedVecTy->getNumElements();
2581 auto Align = DL.getPrefTypeAlign(ElemTy);
2582 InstructionCost LoadCost =
2583 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2584 InstructionCost StoreCost =
2585 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2586 return Opcode == Instruction::ExtractElement
2587 ? StoreCost * NumElems + LoadCost
2588 : (StoreCost + LoadCost) * NumElems + StoreCost;
2589 }
2590
2591 // For unsupported scalable vector.
2592 if (LT.second.isScalableVector() && !LT.first.isValid())
2593 return LT.first;
2594
2595 // Mask vector extract/insert is expanded via e8.
2596 if (Val->getScalarSizeInBits() == 1) {
2597 VectorType *WideTy =
2599 cast<VectorType>(Val)->getElementCount());
2600 if (Opcode == Instruction::ExtractElement) {
2601 InstructionCost ExtendCost
2602 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2604 InstructionCost ExtractCost
2605 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2606 return ExtendCost + ExtractCost;
2607 }
2608 InstructionCost ExtendCost
2609 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2611 InstructionCost InsertCost
2612 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2613 InstructionCost TruncCost
2614 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2616 return ExtendCost + InsertCost + TruncCost;
2617 }
2618
2619
2620 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2621 // and vslideup + vmv.s.x to insert element to vector.
2622 unsigned BaseCost = 1;
2623 // When insertelement we should add the index with 1 as the input of vslideup.
2624 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2625
2626 if (Index != -1U) {
2627 // The type may be split. For fixed-width vectors we can normalize the
2628 // index to the new type.
2629 if (LT.second.isFixedLengthVector()) {
2630 unsigned Width = LT.second.getVectorNumElements();
2631 Index = Index % Width;
2632 }
2633
2634 // If exact VLEN is known, we will insert/extract into the appropriate
2635 // subvector with no additional subvector insert/extract cost.
2636 if (auto VLEN = ST->getRealVLen()) {
2637 unsigned EltSize = LT.second.getScalarSizeInBits();
2638 unsigned M1Max = *VLEN / EltSize;
2639 Index = Index % M1Max;
2640 }
2641
2642 if (Index == 0)
2643 // We can extract/insert the first element without vslidedown/vslideup.
2644 SlideCost = 0;
2645 else if (Opcode == Instruction::InsertElement)
2646 SlideCost = 1; // With a constant index, we do not need to use addi.
2647 }
2648
2649 // When the vector needs to split into multiple register groups and the index
2650 // exceeds single vector register group, we need to insert/extract the element
2651 // via stack.
2652 if (LT.first > 1 &&
2653 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2654 LT.second.isScalableVector()))) {
2655 Type *ScalarType = Val->getScalarType();
2656 Align VecAlign = DL.getPrefTypeAlign(Val);
2657 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2658 // Extra addi for unknown index.
2659 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2660
2661 // Store all split vectors into stack and load the target element.
2662 if (Opcode == Instruction::ExtractElement)
2663 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2664 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2665 CostKind) +
2666 IdxCost;
2667
2668 // Store all split vectors into stack and store the target element and load
2669 // vectors back.
2670 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2671 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2672 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2673 CostKind) +
2674 IdxCost;
2675 }
2676
2677 // Extract i64 in the target that has XLEN=32 need more instruction.
2678 if (Val->getScalarType()->isIntegerTy() &&
2679 ST->getXLen() < Val->getScalarSizeInBits()) {
2680 // For extractelement, we need the following instructions:
2681 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2682 // vslidedown.vx v8, v8, a0
2683 // vmv.x.s a0, v8
2684 // li a1, 32
2685 // vsrl.vx v8, v8, a1
2686 // vmv.x.s a1, v8
2687
2688 // For insertelement, we need the following instructions:
2689 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2690 // vmv.v.i v12, 0
2691 // vslide1up.vx v16, v12, a1
2692 // vslide1up.vx v12, v16, a0
2693 // addi a0, a2, 1
2694 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2695 // vslideup.vx v8, v12, a2
2696
2697 // TODO: should we count these special vsetvlis?
2698 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2699 }
2700 return BaseCost + SlideCost;
2701}
2702
// NOTE(review): the opening lines of this definition's signature are absent
// from this listing (the embedded numbering jumps from 2702 to 2706, and 2708
// is absent too), so the function name and its leading parameters cannot be
// confirmed from here.
// Visible behavior: for a FixedVectorType the (partially missing) return
// statement forwards Index; for any other vector type the cost is obtained
// from getVectorInstrCost using an index mirrored against the known minimum
// element count.
2706 unsigned Index) const {
2707 if (isa<FixedVectorType>(Val))
2709 Index);
2710
2711 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2712 // for the cost of extracting the last lane of a scalable vector. It probably
2713 // needs a more accurate cost.
2714 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2715 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
// Index counts back from the end, so translate it to a forward lane number
// relative to the known minimum element count before querying the cost.
2716 return getVectorInstrCost(Opcode, Val, CostKind,
2717 EC.getKnownMinValue() - 1 - Index, nullptr,
2718 nullptr);
2719}
2720
2721/// Check to see if this instruction is expected to be combined to a simpler
2722/// operation during/before lowering. If so, return the cost of the combined
2723/// operation rather than the provided one. For instance, `udiv i16 %X, 2` is
2724/// likely to be combined to `lshr i16 %X, 1`, so return the cost of a `lshr`
2725/// rather than the cost of a `udiv`.
///
/// Only UDiv/URem whose second operand is a constant power of two are handled:
/// UDiv is costed as LShr and URem as And, with the second operand's
/// properties cleared via getNoProps(). Every other case yields std::nullopt.
///
/// NOTE(review): the lines carrying the function name and the Opd1Info /
/// Opd2Info parameters are absent from this listing (embedded numbers 2727
/// and 2729 are skipped).
2726std::optional<InstructionCost>
2728 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2730 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2731 // Vector unsigned division/remainder will be simplified to shifts/masks.
2732 if ((Opcode == Instruction::UDiv || Opcode == Instruction::URem) &&
2733 Opd2Info.isConstant() && Opd2Info.isPowerOf2()) {
2734 if (Opcode == Instruction::UDiv)
2735 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Opd1Info,
2736 Opd2Info.getNoProps());
2737 // UREM
2738 return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Opd1Info,
2739 Opd2Info.getNoProps());
2740 }
2741 return std::nullopt;
2742}
2743
// NOTE(review): several lines of this definition are absent from this listing
// (embedded numbers 2744, 2746, 2750, 2764, 2797, 2804 and 2808 are skipped),
// including the line that carries the function name and the condition
// guarding the first early return. The BaseT::getArithmeticInstrCost
// fallbacks suggest this is the RISC-V override of getArithmeticInstrCost --
// confirm against the full file.
//
// Visible flow:
//  * fall back to BaseT for fixed vectors when RVV is not used for them, and
//    for vectors whose scalar size exceeds ELEN;
//  * return a combined-operation cost when one is available;
//  * for types that do not legalize to a vector, use the scalar div/rem table
//    or fall back to BaseT;
//  * add extend/truncate cast cost for f16/bf16 element types that are
//    computed in a promoted type;
//  * add the cost of materializing constant operands;
//  * map the ISD opcode to a representative RVV instruction, double the cost
//    for floating point, and scale by the legalization factor LT.first.
2745 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2747 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2748
2749 // TODO: Handle more cost kinds.
2751 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2752 Args, CxtI);
2753
2754 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2755 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2756 Args, CxtI);
2757
2758 // Skip if scalar size of Ty is bigger than ELEN.
2759 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2760 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2761 Args, CxtI);
2762
2763 if (std::optional<InstructionCost> CombinedCost =
2765 Op2Info, Args, CxtI))
2766 return *CombinedCost;
2767
2768 // Legalize the type.
2769 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2770 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2771
2772 // TODO: Handle scalar type.
2773 if (!LT.second.isVector()) {
// Scalar integer division and remainder on i32/i64 are all rated
// TCC_Expensive; the table is consulted only when the operation is legal or
// promoted for the legalized type.
2774 static const CostTblEntry DivTbl[]{
2775 {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
2776 {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
2777 {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
2778 {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
2779 {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
2780 {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
2781 {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
2782 {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
2783 if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
2784 if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
2785 return Entry->Cost * LT.first;
2786
2787 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2788 Args, CxtI);
2789 }
2790
2791 // f16 with zvfhmin and bf16 will be promoted to f32.
2792 // FIXME: nxv32[b]f16 will be custom lowered and split.
2793 InstructionCost CastCost = 0;
2794 if ((LT.second.getVectorElementType() == MVT::f16 ||
2795 LT.second.getVectorElementType() == MVT::bf16) &&
2796 TLI->getOperationAction(ISDOpcode, LT.second) ==
2798 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2799 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2800 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2801 // Add cost of extending arguments
2802 CastCost += LT.first * Args.size() *
2803 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2805 // Add cost of truncating result
2806 CastCost +=
2807 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2809 // Compute cost of op in promoted type
2810 LT.second = PromotedVT;
2811 }
2812
// Cost of materializing a constant operand: free when the operand is uniform
// and can be splatted at this operand position, otherwise a constant-pool
// load of Ty.
2813 auto getConstantMatCost =
2814 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2815 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2816 // Two sub-cases:
2817 // * Has a 5 bit immediate operand which can be splatted.
2818 // * Has a larger immediate which must be materialized in scalar register
2819 // We return 0 for both as we currently ignore the cost of materializing
2820 // scalar constants in GPRs.
2821 return 0;
2822
2823 return getConstantPoolLoadCost(Ty, CostKind);
2824 };
2825
2826 // Add the cost of materializing any constant vectors required.
2827 InstructionCost ConstantMatCost = 0;
2828 if (Op1Info.isConstant())
2829 ConstantMatCost += getConstantMatCost(0, Op1Info);
2830 if (Op2Info.isConstant())
2831 ConstantMatCost += getConstantMatCost(1, Op2Info);
2832
// Pick one representative RVV opcode per group of ISD opcodes; every member
// of a group is costed as that representative.
2833 unsigned Op;
2834 switch (ISDOpcode) {
2835 case ISD::ADD:
2836 case ISD::SUB:
2837 Op = RISCV::VADD_VV;
2838 break;
2839 case ISD::SHL:
2840 case ISD::SRL:
2841 case ISD::SRA:
2842 Op = RISCV::VSLL_VV;
2843 break;
2844 case ISD::AND:
2845 case ISD::OR:
2846 case ISD::XOR:
2847 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2848 break;
2849 case ISD::MUL:
2850 case ISD::MULHS:
2851 case ISD::MULHU:
2852 Op = RISCV::VMUL_VV;
2853 break;
2854 case ISD::SDIV:
2855 case ISD::UDIV:
2856 Op = RISCV::VDIV_VV;
2857 break;
2858 case ISD::SREM:
2859 case ISD::UREM:
2860 Op = RISCV::VREM_VV;
2861 break;
2862 case ISD::FADD:
2863 case ISD::FSUB:
2864 Op = RISCV::VFADD_VV;
2865 break;
2866 case ISD::FMUL:
2867 Op = RISCV::VFMUL_VV;
2868 break;
2869 case ISD::FDIV:
2870 Op = RISCV::VFDIV_VV;
2871 break;
2872 case ISD::FNEG:
2873 Op = RISCV::VFSGNJN_VV;
2874 break;
2875 default:
2876 // Assuming all other instructions have the same cost until a need arises to
2877 // differentiate them.
2878 return CastCost + ConstantMatCost +
2879 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2880 Args, CxtI);
2881 }
2882
2883 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2884 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2885 // ops are twice as expensive as integer ops. Do the same for vectors so
2886 // scalar floating point ops aren't cheaper than their vector equivalents.
2887 if (Ty->isFPOrFPVectorTy())
2888 InstrCost *= 2;
2889 return CastCost + ConstantMatCost + LT.first * InstrCost;
2890}
2891
2892// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
// NOTE(review): the line carrying the function name and the lines that
// declare `CostKind` and the `Cost` accumulator are absent from this listing
// (embedded numbers 2893, 2896 and 2897 are skipped); both names are used
// below, so confirm their declarations against the full file.
//
// Accumulates a cost over the GEP instructions in Ptrs; non-GEP pointers are
// ignored. With a shared base, a non-base GEP is free when all its indices
// are constant or when its unit-stride offset is a legal addressing mode, and
// is otherwise charged as an Add. In every other case the GEP is charged via
// getGEPCost.
2894 ArrayRef<const Value *> Ptrs, const Value *Base,
2895 const TTI::PointersChainInfo &Info, Type *AccessTy,
2898 // In the basic model we take into account GEP instructions only
2899 // (although here can come alloca instruction, a value, constants and/or
2900 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2901 // pointer). Typically, if Base is a not a GEP-instruction and all the
2902 // pointers are relative to the same base address, all the rest are
2903 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2904 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2905 // any their index is a non-const.
2906 // If no known dependencies between the pointers cost is calculated as a sum
2907 // of costs of GEP instructions.
2908 for (auto [I, V] : enumerate(Ptrs)) {
2909 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2910 if (!GEP)
2911 continue;
2912 if (Info.isSameBase() && V != Base) {
2913 if (GEP->hasAllConstantIndices())
2914 continue;
2915 // If the chain is unit-stride and BaseReg + stride*i is a legal
2916 // addressing mode, then presume the base GEP is sitting around in a
2917 // register somewhere and check if we can fold the offset relative to
2918 // it.
2919 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2920 if (Info.isUnitStride() &&
2921 isLegalAddressingMode(AccessTy,
2922 /* BaseGV */ nullptr,
2923 /* BaseOffset */ Stride * I,
2924 /* HasBaseReg */ true,
2925 /* Scale */ 0,
2926 GEP->getType()->getPointerAddressSpace()))
2927 continue;
2928 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2929 {TTI::OK_AnyValue, TTI::OP_None},
2930 {TTI::OK_AnyValue, TTI::OP_None}, {});
2931 } else {
2932 SmallVector<const Value *> Indices(GEP->indices());
2933 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2934 Indices, AccessTy, CostKind);
2935 }
2936 }
2937 return Cost;
2938}
2939
// NOTE(review): the opening lines of this signature and several body lines
// are absent from this listing (embedded numbers 2940, 2941, 2956, 2980 and
// 3002 are skipped). In particular the declaration of `Cost` and the tail of
// the getInstructionCost call are not visible here. The fallback call to
// BasicTTIImplBase::getUnrollingPreferences suggests this is the RISC-V
// override of that hook -- confirm against the full file.
//
// Visible flow: defer to the generic implementation when the subtarget asks
// for default unrolling; otherwise enable upper-bound unrolling, bail out
// (leaving the remaining preferences untouched) for opt-size functions, loops
// with more than two exiting blocks, more than four blocks, vector
// instructions in a vectorized loop, or calls that are actually lowered to
// calls; finally enable partial/runtime unrolling and force it for loops
// whose accumulated cost is below 12.
2942 OptimizationRemarkEmitter *ORE) const {
2943 // TODO: More tuning on benchmarks and metrics with changes as needed
2944 // would apply to all settings below to enable performance.
2945
2946
2947 if (ST->enableDefaultUnroll())
2948 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2949
2950 // Enable Upper bound unrolling universally, not dependent upon the conditions
2951 // below.
2952 UP.UpperBound = true;
2953
2954 // Disable loop unrolling for Oz and Os.
2955 UP.OptSizeThreshold = 0;
2957 if (L->getHeader()->getParent()->hasOptSize())
2958 return;
2959
2960 SmallVector<BasicBlock *, 4> ExitingBlocks;
2961 L->getExitingBlocks(ExitingBlocks);
2962 LLVM_DEBUG(dbgs() << "Loop has:\n"
2963 << "Blocks: " << L->getNumBlocks() << "\n"
2964 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2965
2966 // Only allow another exit other than the latch. This acts as an early exit
2967 // as it mirrors the profitability calculation of the runtime unroller.
2968 if (ExitingBlocks.size() > 2)
2969 return;
2970
2971 // Limit the CFG of the loop body for targets with a branch predictor.
2972 // Allowing 4 blocks permits if-then-else diamonds in the body.
2973 if (L->getNumBlocks() > 4)
2974 return;
2975
2976 // Scan the loop: don't unroll loops with calls as this could prevent
2977 // inlining. Don't unroll auto-vectorized loops either, though do allow
2978 // unrolling of the scalar remainder.
2979 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2981 for (auto *BB : L->getBlocks()) {
2982 for (auto &I : *BB) {
2983 // Both auto-vectorized loops and the scalar remainder have the
2984 // isvectorized attribute, so differentiate between them by the presence
2985 // of vector instructions.
2986 if (IsVectorized && (I.getType()->isVectorTy() ||
2987 llvm::any_of(I.operand_values(), [](Value *V) {
2988 return V->getType()->isVectorTy();
2989 })))
2990 return;
2991
2992 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
// A call with a known callee that is not lowered to a real call is skipped
// (and not added to Cost); any other call or invoke aborts the tuning.
2993 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2994 if (!isLoweredToCall(F))
2995 continue;
2996 }
2997 return;
2998 }
2999
3000 SmallVector<const Value *> Operands(I.operand_values());
3001 Cost += getInstructionCost(&I, Operands,
3003 }
3004 }
3005
3006 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
3007
3008 UP.Partial = true;
3009 UP.Runtime = true;
3010 UP.UnrollRemainder = true;
3011 UP.UnrollAndJam = true;
3012
3013 // Force unrolling small loops can be very useful because of the branch
3014 // taken cost of the backedge.
3015 if (Cost < 12)
3016 UP.Force = true;
3017}
3018
3023
// NOTE(review): the first line of this signature (function name and the
// `Inst` parameter) and three body lines are absent from this listing
// (embedded numbers 3024, 3092, 3168 and 3287 are skipped). Each skipped body
// line sits between `Ty = TarExtTy->getTypeParameter(0U);` and the two
// arguments that follow it, so the call those arguments belong to is not
// visible here.
//
// Describes the memory operand of RVV load/store intrinsics. Three families
// are handled: unit-stride (vle/vse/vlseg/vsseg), strided
// (vlse/vsse/vlsseg/vssseg) and indexed (vloxei/vluxei/vsoxei/vsuxei and
// their segment forms), each with masked variants. For a handled intrinsic
// one entry is appended to Info.InterestingOperands and true is returned;
// any other intrinsic returns false.
3025 MemIntrinsicInfo &Info) const {
3026 const DataLayout &DL = getDataLayout();
3027 Intrinsic::ID IID = Inst->getIntrinsicID();
3028 LLVMContext &C = Inst->getContext();
3029 bool HasMask = false;
3030
// Number of fields per segment: read from integer parameter 0 of the
// TargetExtType of argument 0 when there is one, otherwise 1. The
// PtrOperandNo and IsWrite parameters are not used by the lambda body.
3031 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
3032 bool IsWrite) -> int64_t {
3033 if (auto *TarExtTy =
3034 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
3035 return TarExtTy->getIntParameter(0);
3036
3037 return 1;
3038 };
3039
3040 switch (IID) {
3041 case Intrinsic::riscv_vle_mask:
3042 case Intrinsic::riscv_vse_mask:
3043 case Intrinsic::riscv_vlseg2_mask:
3044 case Intrinsic::riscv_vlseg3_mask:
3045 case Intrinsic::riscv_vlseg4_mask:
3046 case Intrinsic::riscv_vlseg5_mask:
3047 case Intrinsic::riscv_vlseg6_mask:
3048 case Intrinsic::riscv_vlseg7_mask:
3049 case Intrinsic::riscv_vlseg8_mask:
3050 case Intrinsic::riscv_vsseg2_mask:
3051 case Intrinsic::riscv_vsseg3_mask:
3052 case Intrinsic::riscv_vsseg4_mask:
3053 case Intrinsic::riscv_vsseg5_mask:
3054 case Intrinsic::riscv_vsseg6_mask:
3055 case Intrinsic::riscv_vsseg7_mask:
3056 case Intrinsic::riscv_vsseg8_mask:
3057 HasMask = true;
3058 [[fallthrough]];
3059 case Intrinsic::riscv_vle:
3060 case Intrinsic::riscv_vse:
3061 case Intrinsic::riscv_vlseg2:
3062 case Intrinsic::riscv_vlseg3:
3063 case Intrinsic::riscv_vlseg4:
3064 case Intrinsic::riscv_vlseg5:
3065 case Intrinsic::riscv_vlseg6:
3066 case Intrinsic::riscv_vlseg7:
3067 case Intrinsic::riscv_vlseg8:
3068 case Intrinsic::riscv_vsseg2:
3069 case Intrinsic::riscv_vsseg3:
3070 case Intrinsic::riscv_vsseg4:
3071 case Intrinsic::riscv_vsseg5:
3072 case Intrinsic::riscv_vsseg6:
3073 case Intrinsic::riscv_vsseg7:
3074 case Intrinsic::riscv_vsseg8: {
3075 // Intrinsic interface:
3076 // riscv_vle(merge, ptr, vl)
3077 // riscv_vle_mask(merge, ptr, mask, vl, policy)
3078 // riscv_vse(val, ptr, vl)
3079 // riscv_vse_mask(val, ptr, mask, vl, policy)
3080 // riscv_vlseg#(merge, ptr, vl, sew)
3081 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
3082 // riscv_vsseg#(val, ptr, vl, sew)
3083 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
// A void result type marks a store; the accessed type is then taken from
// argument 0 instead of the result.
3084 bool IsWrite = Inst->getType()->isVoidTy();
3085 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3086 // The results of segment loads are TargetExtType.
3087 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
// The last argument holds log2 of the element width.
3088 unsigned SEW =
3089 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3090 ->getZExtValue();
3091 Ty = TarExtTy->getTypeParameter(0U);
3093 IntegerType::get(C, SEW),
3094 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3095 }
3096 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3097 unsigned VLIndex = RVVIInfo->VLOperand;
// Per the interface above, ptr sits directly before vl, or before the mask
// when there is one.
3098 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
3099 MaybeAlign Alignment =
3100 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3101 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3102 Value *Mask = ConstantInt::getTrue(MaskType);
3103 if (HasMask)
3104 Mask = Inst->getArgOperand(VLIndex - 1);
3105 Value *EVL = Inst->getArgOperand(VLIndex);
3106 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3107 // RVV uses contiguous elements as a segment.
3108 if (SegNum > 1) {
3109 unsigned ElemSize = Ty->getScalarSizeInBits();
3110 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3111 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3112 }
3113 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3114 Alignment, Mask, EVL);
3115 return true;
3116 }
3117 case Intrinsic::riscv_vlse_mask:
3118 case Intrinsic::riscv_vsse_mask:
3119 case Intrinsic::riscv_vlsseg2_mask:
3120 case Intrinsic::riscv_vlsseg3_mask:
3121 case Intrinsic::riscv_vlsseg4_mask:
3122 case Intrinsic::riscv_vlsseg5_mask:
3123 case Intrinsic::riscv_vlsseg6_mask:
3124 case Intrinsic::riscv_vlsseg7_mask:
3125 case Intrinsic::riscv_vlsseg8_mask:
3126 case Intrinsic::riscv_vssseg2_mask:
3127 case Intrinsic::riscv_vssseg3_mask:
3128 case Intrinsic::riscv_vssseg4_mask:
3129 case Intrinsic::riscv_vssseg5_mask:
3130 case Intrinsic::riscv_vssseg6_mask:
3131 case Intrinsic::riscv_vssseg7_mask:
3132 case Intrinsic::riscv_vssseg8_mask:
3133 HasMask = true;
3134 [[fallthrough]];
3135 case Intrinsic::riscv_vlse:
3136 case Intrinsic::riscv_vsse:
3137 case Intrinsic::riscv_vlsseg2:
3138 case Intrinsic::riscv_vlsseg3:
3139 case Intrinsic::riscv_vlsseg4:
3140 case Intrinsic::riscv_vlsseg5:
3141 case Intrinsic::riscv_vlsseg6:
3142 case Intrinsic::riscv_vlsseg7:
3143 case Intrinsic::riscv_vlsseg8:
3144 case Intrinsic::riscv_vssseg2:
3145 case Intrinsic::riscv_vssseg3:
3146 case Intrinsic::riscv_vssseg4:
3147 case Intrinsic::riscv_vssseg5:
3148 case Intrinsic::riscv_vssseg6:
3149 case Intrinsic::riscv_vssseg7:
3150 case Intrinsic::riscv_vssseg8: {
3151 // Intrinsic interface:
3152 // riscv_vlse(merge, ptr, stride, vl)
3153 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3154 // riscv_vsse(val, ptr, stride, vl)
3155 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3156 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3157 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3158 // riscv_vssseg#(val, ptr, offset, vl, sew)
3159 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3160 bool IsWrite = Inst->getType()->isVoidTy();
3161 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3162 // The results of segment loads are TargetExtType.
3163 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3164 unsigned SEW =
3165 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3166 ->getZExtValue();
3167 Ty = TarExtTy->getTypeParameter(0U);
3169 IntegerType::get(C, SEW),
3170 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3171 }
3172 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3173 unsigned VLIndex = RVVIInfo->VLOperand;
// Per the interface above, ptr sits two slots before vl (the stride is in
// between), or three when there is a mask.
3174 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3175 MaybeAlign Alignment =
3176 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
3177
3178 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
3179 // Use the pointer alignment as the element alignment if the stride is a
3180 // multiple of the pointer alignment. Otherwise, the element alignment
3181 // should be the greatest common divisor of pointer alignment and stride.
3182 // For simplicity, just consider unalignment for elements.
3183 unsigned PointerAlign = Alignment.valueOrOne().value();
3184 if (!isa<ConstantInt>(Stride) ||
3185 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
3186 Alignment = Align(1);
3187
3188 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3189 Value *Mask = ConstantInt::getTrue(MaskType);
3190 if (HasMask)
3191 Mask = Inst->getArgOperand(VLIndex - 1);
3192 Value *EVL = Inst->getArgOperand(VLIndex);
3193 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3194 // RVV uses contiguous elements as a segment.
3195 if (SegNum > 1) {
3196 unsigned ElemSize = Ty->getScalarSizeInBits();
3197 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3198 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3199 }
3200 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3201 Alignment, Mask, EVL, Stride);
3202 return true;
3203 }
3204 case Intrinsic::riscv_vloxei_mask:
3205 case Intrinsic::riscv_vluxei_mask:
3206 case Intrinsic::riscv_vsoxei_mask:
3207 case Intrinsic::riscv_vsuxei_mask:
3208 case Intrinsic::riscv_vloxseg2_mask:
3209 case Intrinsic::riscv_vloxseg3_mask:
3210 case Intrinsic::riscv_vloxseg4_mask:
3211 case Intrinsic::riscv_vloxseg5_mask:
3212 case Intrinsic::riscv_vloxseg6_mask:
3213 case Intrinsic::riscv_vloxseg7_mask:
3214 case Intrinsic::riscv_vloxseg8_mask:
3215 case Intrinsic::riscv_vluxseg2_mask:
3216 case Intrinsic::riscv_vluxseg3_mask:
3217 case Intrinsic::riscv_vluxseg4_mask:
3218 case Intrinsic::riscv_vluxseg5_mask:
3219 case Intrinsic::riscv_vluxseg6_mask:
3220 case Intrinsic::riscv_vluxseg7_mask:
3221 case Intrinsic::riscv_vluxseg8_mask:
3222 case Intrinsic::riscv_vsoxseg2_mask:
3223 case Intrinsic::riscv_vsoxseg3_mask:
3224 case Intrinsic::riscv_vsoxseg4_mask:
3225 case Intrinsic::riscv_vsoxseg5_mask:
3226 case Intrinsic::riscv_vsoxseg6_mask:
3227 case Intrinsic::riscv_vsoxseg7_mask:
3228 case Intrinsic::riscv_vsoxseg8_mask:
3229 case Intrinsic::riscv_vsuxseg2_mask:
3230 case Intrinsic::riscv_vsuxseg3_mask:
3231 case Intrinsic::riscv_vsuxseg4_mask:
3232 case Intrinsic::riscv_vsuxseg5_mask:
3233 case Intrinsic::riscv_vsuxseg6_mask:
3234 case Intrinsic::riscv_vsuxseg7_mask:
3235 case Intrinsic::riscv_vsuxseg8_mask:
3236 HasMask = true;
3237 [[fallthrough]];
3238 case Intrinsic::riscv_vloxei:
3239 case Intrinsic::riscv_vluxei:
3240 case Intrinsic::riscv_vsoxei:
3241 case Intrinsic::riscv_vsuxei:
3242 case Intrinsic::riscv_vloxseg2:
3243 case Intrinsic::riscv_vloxseg3:
3244 case Intrinsic::riscv_vloxseg4:
3245 case Intrinsic::riscv_vloxseg5:
3246 case Intrinsic::riscv_vloxseg6:
3247 case Intrinsic::riscv_vloxseg7:
3248 case Intrinsic::riscv_vloxseg8:
3249 case Intrinsic::riscv_vluxseg2:
3250 case Intrinsic::riscv_vluxseg3:
3251 case Intrinsic::riscv_vluxseg4:
3252 case Intrinsic::riscv_vluxseg5:
3253 case Intrinsic::riscv_vluxseg6:
3254 case Intrinsic::riscv_vluxseg7:
3255 case Intrinsic::riscv_vluxseg8:
3256 case Intrinsic::riscv_vsoxseg2:
3257 case Intrinsic::riscv_vsoxseg3:
3258 case Intrinsic::riscv_vsoxseg4:
3259 case Intrinsic::riscv_vsoxseg5:
3260 case Intrinsic::riscv_vsoxseg6:
3261 case Intrinsic::riscv_vsoxseg7:
3262 case Intrinsic::riscv_vsoxseg8:
3263 case Intrinsic::riscv_vsuxseg2:
3264 case Intrinsic::riscv_vsuxseg3:
3265 case Intrinsic::riscv_vsuxseg4:
3266 case Intrinsic::riscv_vsuxseg5:
3267 case Intrinsic::riscv_vsuxseg6:
3268 case Intrinsic::riscv_vsuxseg7:
3269 case Intrinsic::riscv_vsuxseg8: {
3270 // Intrinsic interface (only listed ordered version):
3271 // riscv_vloxei(merge, ptr, index, vl)
3272 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3273 // riscv_vsoxei(val, ptr, index, vl)
3274 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3275 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3276 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3277 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3278 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3279 bool IsWrite = Inst->getType()->isVoidTy();
3280 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3281 // The results of segment loads are TargetExtType.
3282 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3283 unsigned SEW =
3284 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3285 ->getZExtValue();
3286 Ty = TarExtTy->getTypeParameter(0U);
3288 IntegerType::get(C, SEW),
3289 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3290 }
3291 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3292 unsigned VLIndex = RVVIInfo->VLOperand;
3293 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3294 Value *Mask;
3295 if (HasMask) {
3296 Mask = Inst->getArgOperand(VLIndex - 1);
3297 } else {
3298 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3299 // and casting that to scalar i64 triggers a vector/scalar mismatch
3300 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3301 // via extractelement instead.
3302 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3303 Mask = ConstantInt::getTrue(MaskType);
3304 }
3305 Value *EVL = Inst->getArgOperand(VLIndex);
3306 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3307 // RVV uses contiguous elements as a segment.
3308 if (SegNum > 1) {
3309 unsigned ElemSize = Ty->getScalarSizeInBits();
3310 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3311 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3312 }
// The index vector follows ptr; indexed accesses are always recorded with
// Align(1) and no stride.
3313 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3314 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3315 Align(1), Mask, EVL,
3316 /* Stride */ nullptr, OffsetOp);
3317 return true;
3318 }
3319 }
3320 return false;
3321}
3322
// NOTE(review): the signature line of this definition is absent from this
// listing (embedded number 3323 is skipped); the BaseT::getRegUsageForType
// fallback suggests this is the RISC-V override of getRegUsageForType.
//
// For vector types, half (when the subtarget lacks F16 vector instructions)
// and bfloat element types are first widened to float. Scalable types on a
// subtarget with vector instructions are measured in RVVBitsPerBlock units
// (rounded up); otherwise, when RVV is used for fixed-length vectors, the
// size is measured in units of the real minimum VLEN (rounded up). Everything
// else defers to BaseT.
3324 if (Ty->isVectorTy()) {
3325 // f16 with only zvfhmin and bf16 will be promoted to f32
3326 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3327 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3328 EltTy->isBFloatTy())
3329 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3330 cast<VectorType>(Ty));
3331
3332 TypeSize Size = DL.getTypeSizeInBits(Ty);
3333 if (Size.isScalable() && ST->hasVInstructions())
3334 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3335
3336 if (ST->useRVVForFixedLengthVectors())
3337 return divideCeil(Size, ST->getRealMinVLen());
3338 }
3339
3340 return BaseT::getRegUsageForType(Ty);
3341}
3342
// Returns the maximum vectorization factor for elements of ElemWidth bits.
// An explicit SLPMaxVF command-line value takes precedence; otherwise the
// result is RegWidth / ElemWidth, clamped to at least 1. Opcode is not
// referenced in the visible body.
// NOTE(review): the initializer of RegWidth is absent from this listing
// (embedded number 3353 is skipped); the comment below implies it comes from
// the register bit-width query -- confirm against the full file.
3343unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3344 if (SLPMaxVF.getNumOccurrences())
3345 return SLPMaxVF;
3346
3347 // Return how many elements can fit in getRegisterBitwidth. This is the
3348 // same routine as used in LoopVectorizer. We should probably be
3349 // accounting for whether we actually have instructions with the right
3350 // lane type, but we don't have enough information to do that without
3351 // some additional plumbing which hasn't been justified yet.
3352 TypeSize RegWidth =
3354 // If no vector registers, or absurd element widths, disable
3355 // vectorization by returning 1.
3356 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3357}
3358
3362
// NOTE(review): the signature line of this definition is absent from this
// listing (embedded number 3363 is skipped), so its name and parameters
// cannot be confirmed from here. The visible body simply forwards the
// subtarget's enableUnalignedVectorMem() setting.
3364 return ST->enableUnalignedVectorMem();
3365}
3366
// NOTE(review): the opening lines of this signature and the final return
// statement are absent from this listing (embedded numbers 3367, 3368 and
// 3373 are skipped), so the fallback result is not visible here.
// Visible behavior: post-indexed addressing is preferred when the subtarget
// has the XCVmem vendor extension and is not 64-bit.
3369 ScalarEvolution *SE) const {
3370 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3371 return TTI::AMK_PostIndexed;
3372
3374}
3375
// NOTE(review): the first line of this signature (function name and the C1
// parameter) is absent from this listing (embedded number 3376 is skipped).
//
// Compares two LSR cost records lexicographically, returning true when C1
// orders before C2. Instruction count is the most significant key, followed
// by the adjusted register count, AddRecCost, NumIVMuls, NumBaseAdds,
// ScaleCost, ImmCost and SetupCost.
3377 const TargetTransformInfo::LSRCost &C2) const {
3378 // RISC-V specific here are "instruction number 1st priority".
3379 // If we need to emit adds inside the loop to add up base registers, then
3380 // we need at least one extra temporary register.
3381 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3382 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3383 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3384 C1.NumIVMuls, C1.NumBaseAdds,
3385 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3386 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3387 C2.NumIVMuls, C2.NumBaseAdds,
3388 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3389}
3390
// NOTE(review): the first line of this signature (function name and the
// DataTy parameter) is absent from this listing (embedded number 3391 is
// skipped).
//
// Visible behavior: returns false for non-vector and scalable types and for
// types rejected by isLegalMaskedLoadStore. For i8 vectors with more than 256
// elements the result depends on whether the vector's size in units of the
// real minimum VLEN is below the maximum fixed-length LMUL; all other
// accepted types return true.
3392 Align Alignment) const {
3393 auto *VTy = dyn_cast<VectorType>(DataTy);
3394 if (!VTy || VTy->isScalableTy())
3395 return false;
3396
3397 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3398 return false;
3399
3400 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3401 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3402 if (VTy->getElementType()->isIntegerTy(8))
3403 if (VTy->getElementCount().getFixedValue() > 256)
3404 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3405 ST->getMaxLMULForFixedLengthVectors();
3406 return true;
3407}
3408
// NOTE(review): the first line of this signature (function name and the
// DataTy parameter) is absent from this listing (embedded number 3409 is
// skipped).
//
// Visible behavior: legal only for fixed-length vector types that
// isLegalMaskedLoadStore accepts at the given alignment.
3410 Align Alignment) const {
3411 auto *VTy = dyn_cast<VectorType>(DataTy);
3412 if (!VTy || VTy->isScalableTy())
3413 return false;
3414
3415 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3416 return false;
3417 return true;
3418}
3419
// NOTE(review): the first line of this signature (function name and the
// ElementTy parameter) is absent from this listing (embedded number 3420 is
// skipped).
//
// Visible behavior: requires vector instructions and optimized zero-stride
// loads on the subtarget, then defers to the lowering info on whether the
// element type is legal for RVV. NumElements is not referenced in the body.
3421 ElementCount NumElements) const {
3422 // Optimized zero-stride loads can be treated as broadcasts.
3423 if (!ST->hasVInstructions() || !ST->hasOptimizedZeroStrideLoad())
3424 return false;
3425
3426 return TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, ElementTy));
3427}
3428
3429/// See if \p I should be considered for address type promotion. We check if \p
3430/// I is a sext with the right type and used in memory accesses. If it is used
3431/// in a "complex" getelementptr, we allow it to be promoted without finding
3432/// other sext instructions that sign extended the same initial value. A
3433/// getelementptr is considered as "complex" if it has more than 2 operands.
///
/// \returns true when \p I is a sext producing i64 with at least one
/// GetElementPtrInst user. \p AllowPromotionWithoutCommonHeader is always
/// reset to false on entry and set to true only when one of those users is a
/// "complex" getelementptr.
///
/// NOTE(review): the line carrying the function name is absent from this
/// listing (embedded number 3434 is skipped).
3435 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3436 bool Considerable = false;
3437 AllowPromotionWithoutCommonHeader = false;
3438 if (!isa<SExtInst>(&I))
3439 return false;
3440 Type *ConsideredSExtType =
3441 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3442 if (I.getType() != ConsideredSExtType)
3443 return false;
3444 // See if the sext is the one with the right type and used in at least one
3445 // GetElementPtrInst.
3446 for (const User *U : I.users()) {
3447 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3448 Considerable = true;
3449 // A getelementptr is considered as "complex" if it has more than 2
3450 // operands. We will promote a SExt used in such complex GEP as we
3451 // expect some computation to be merged if they are done on 64 bits.
3452 if (GEPInst->getNumOperands() > 2) {
3453 AllowPromotionWithoutCommonHeader = true;
3454 break;
3455 }
3456 }
3457 }
3458 return Considerable;
3459}
3460
3461bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3462 switch (Opcode) {
3463 case Instruction::Add:
3464 case Instruction::Sub:
3465 case Instruction::Mul:
3466 case Instruction::And:
3467 case Instruction::Or:
3468 case Instruction::Xor:
3469 case Instruction::FAdd:
3470 case Instruction::FSub:
3471 case Instruction::FMul:
3472 case Instruction::FDiv:
3473 case Instruction::ICmp:
3474 case Instruction::FCmp:
3475 return true;
3476 case Instruction::Shl:
3477 case Instruction::LShr:
3478 case Instruction::AShr:
3479 case Instruction::UDiv:
3480 case Instruction::SDiv:
3481 case Instruction::URem:
3482 case Instruction::SRem:
3483 case Instruction::Select:
3484 return Operand == 1;
3485 default:
3486 return false;
3487 }
3488}
3489
// NOTE(review): the signature line of this definition is absent from this
// listing (embedded number 3490 is skipped). The body reads an instruction
// `I` and an operand index `Operand`, and the opcode-based check it calls is
// canSplatOperand(opcode, operand) -- presumably this is the
// instruction-taking overload of canSplatOperand; confirm against the full
// file.
//
// Visible behavior: only vector-typed instructions on a subtarget with vector
// instructions qualify. The opcode-based check is tried first; otherwise the
// instruction must be an intrinsic call, and the intrinsic ID decides which
// operand indices qualify (operands 0 and 1 for the first and last groups,
// only operand 1 for the middle group).
3491 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3492 return false;
3493
3494 if (canSplatOperand(I->getOpcode(), Operand))
3495 return true;
3496
3497 auto *II = dyn_cast<IntrinsicInst>(I);
3498 if (!II)
3499 return false;
3500
3501 switch (II->getIntrinsicID()) {
3502 case Intrinsic::fma:
3503 case Intrinsic::vp_fma:
3504 case Intrinsic::fmuladd:
3505 case Intrinsic::vp_fmuladd:
3506 return Operand == 0 || Operand == 1;
3507 case Intrinsic::vp_shl:
3508 case Intrinsic::vp_lshr:
3509 case Intrinsic::vp_ashr:
3510 case Intrinsic::vp_udiv:
3511 case Intrinsic::vp_sdiv:
3512 case Intrinsic::vp_urem:
3513 case Intrinsic::vp_srem:
3514 case Intrinsic::ssub_sat:
3515 case Intrinsic::vp_ssub_sat:
3516 case Intrinsic::usub_sat:
3517 case Intrinsic::vp_usub_sat:
3518 case Intrinsic::vp_select:
3519 return Operand == 1;
3520 // These intrinsics are commutative.
3521 case Intrinsic::vp_add:
3522 case Intrinsic::vp_mul:
3523 case Intrinsic::vp_and:
3524 case Intrinsic::vp_or:
3525 case Intrinsic::vp_xor:
3526 case Intrinsic::vp_fadd:
3527 case Intrinsic::vp_fmul:
3528 case Intrinsic::vp_icmp:
3529 case Intrinsic::vp_fcmp:
3530 case Intrinsic::smin:
3531 case Intrinsic::vp_smin:
3532 case Intrinsic::umin:
3533 case Intrinsic::vp_umin:
3534 case Intrinsic::smax:
3535 case Intrinsic::vp_smax:
3536 case Intrinsic::umax:
3537 case Intrinsic::vp_umax:
3538 case Intrinsic::sadd_sat:
3539 case Intrinsic::vp_sadd_sat:
3540 case Intrinsic::uadd_sat:
3541 case Intrinsic::vp_uadd_sat:
3542 // These intrinsics have 'vr' versions.
3543 case Intrinsic::vp_sub:
3544 case Intrinsic::vp_fsub:
3545 case Intrinsic::vp_fdiv:
3546 return Operand == 0 || Operand == 1;
3547 default:
3548 return false;
3549 }
3550}
3551
3552/// Check if sinking \p I's operands to I's basic block is profitable, because
3553/// the operands can be folded into a target instruction, e.g.
3554/// splats of scalars can fold into vector instructions.
    ///
    /// Uses that should be sunk are appended to \p Ops. The early returns of
    /// `true` happen right after such an append; the final `return true` is
    /// reached even when the splat loop appended nothing.
     // NOTE(review): this listing skips source lines 3555-3556 (the function
     // signature) and 3578 / 3613 (the start of two match(...) calls); the
     // numbering jumps at those points, so the text below is incomplete there.
3557 using namespace llvm::PatternMatch;
3558
     // Bitwise logic ops: look for a `not` operand that can be folded into the
     // logic instruction itself. The first matching operand wins.
3559 if (I->isBitwiseLogicOp()) {
3560 if (!I->getType()->isVectorTy()) {
3561 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3562 for (auto &Op : I->operands()) {
3563 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3564 if (match(Op.get(), m_Not(m_Value()))) {
3565 Ops.push_back(&Op);
3566 return true;
3567 }
3568 }
3569 }
3570 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3571 for (auto &Op : I->operands()) {
3572 // (and X, (not Y)) -> (vandn.vv X, Y)
3573 if (match(Op.get(), m_Not(m_Value()))) {
3574 Ops.push_back(&Op);
3575 return true;
3576 }
3577 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3579 m_ZeroInt()),
3580 m_Value(), m_ZeroMask()))) {
     // Walk inwards from the splat: operand 0 of Op is named InsertElt and
     // operand 1 of that is named Not. The uses are pushed innermost first.
3581 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3582 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3583 Ops.push_back(&Not);
3584 Ops.push_back(&InsertElt);
3585 Ops.push_back(&Op);
3586 return true;
3587 }
3588 }
3589 }
3590 }
3591
     // Everything below only applies to vector instructions on subtargets
     // with vector instructions available.
3592 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3593 return false;
3594
3595 // Don't sink splat operands if the target prefers it. Some targets require
3596 // S2V transfer buffers and we can run out of them copying the same value
3597 // repeatedly.
3598 // FIXME: It could still be worth doing if it would improve vector register
3599 // pressure and prevent a vector spill.
3600 if (!ST->sinkSplatOperands())
3601 return false;
3602
3603 for (auto OpIdx : enumerate(I->operands())) {
3604 if (!canSplatOperand(I, OpIdx.index()))
3605 continue;
3606
3607 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3608 // Make sure we are not already sinking this operand
3609 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3610 continue;
3611
3612 // We are looking for a splat that can be sunk.
3614 m_Value(), m_ZeroMask())))
3615 continue;
3616
3617 // Don't sink i1 splats.
3618 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3619 continue;
3620
3621 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3622 // and vector registers
     // Note that a single unsuitable user makes the whole function return
     // false, not just skip this operand.
3623 for (Use &U : Op->uses()) {
3624 Instruction *Insn = cast<Instruction>(U.getUser());
3625 if (!canSplatOperand(Insn, U.getOperandNo()))
3626 return false;
3627 }
3628
3629 // Sink any fpexts since they might be used in a widening fp pattern.
     // Push order is innermost first: the fpext use (if any), then the
     // insertelement use, then the use of the splat by I.
3630 Use *InsertEltUse = &Op->getOperandUse(0);
3631 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3632 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3633 Ops.push_back(&InsertElt->getOperandUse(1));
3634 Ops.push_back(InsertEltUse);
3635 Ops.push_back(&OpIdx.value());
3636 }
3637 return true;
3638}
3639
3641RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     // Fills in and returns the memcmp expansion options for this subtarget.
     // OptSize is forwarded to TLI->getMaxExpandSizeMemcmp(); IsZeroCmp relaxes
     // the Zbb/Zbkb requirement below and enables the vector-sized loads.
     // NOTE(review): this listing skips source lines 3640 (return type) and
     // 3642 (presumably the declaration of `Options` -- confirm).
3643 // TODO: Enable expansion when unaligned access is not supported after we fix
3644 // issues in ExpandMemcmp.
3645 if (!ST->enableUnalignedScalarMem())
3646 return Options;
3647
     // Without Zbb/Zbkb only the IsZeroCmp form is expanded.
     // NOTE(review): presumably because the ordered compare needs a byte swap
     // (rev8) that those extensions provide -- confirm.
3648 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3649 return Options;
3650
3651 Options.AllowOverlappingLoads = true;
3652 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3653 Options.NumLoadsPerBlock = Options.MaxNumLoads;
     // Scalar load sizes in bytes: up to 8 on RV64, up to 4 otherwise.
3654 if (ST->is64Bit()) {
3655 Options.LoadSizes = {8, 4, 2, 1};
3656 Options.AllowedTailExpansions = {3, 5, 6};
3657 } else {
3658 Options.LoadSizes = {4, 2, 1};
3659 Options.AllowedTailExpansions = {3};
3660 }
3661
3662 if (IsZeroCmp && ST->hasVInstructions()) {
3663 unsigned VLenB = ST->getRealMinVLen() / 8;
3664 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3665 // `VLenB * MaxLMUL` so that it fits in a single register group.
3666 unsigned MinSize = ST->getXLen() / 8 + 1;
3667 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
     // Sizes are visited in ascending order and each is inserted at the front,
     // so LoadSizes ends up descending: MaxSize ... MinSize, then the scalar
     // sizes set above.
3668 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3669 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3670 }
3671 return Options;
3672}
3673
3675 const Instruction *I) const {
     // Reports whether \p I should be treated like a select. In the visible
     // code this is true for an `or` immediately followed by an unconditional
     // branch, and for any `add` or `sub`.
     // NOTE(review): this listing skips source lines 3674 (start of the
     // signature), 3676 (the opener of the block closed at 3688 -- presumably a
     // check of the EnableOrLikeSelectOpt option, confirm) and 3689 (the
     // fallback return).
3677 // For the binary operators (e.g. or) we need to be more careful than
3678 // selects, here we only transform them if they are already at a natural
3679 // break point in the code - the end of a block with an unconditional
3680 // terminator.
3681 if (I->getOpcode() == Instruction::Or &&
3682 isa<UncondBrInst>(I->getNextNode()))
3683 return true;
3684
     // Unlike `or`, add/sub are accepted regardless of what follows them.
3685 if (I->getOpcode() == Instruction::Add ||
3686 I->getOpcode() == Instruction::Sub)
3687 return true;
3688 }
3690}
3691
3693 const Function *Caller, const Attribute &Attr) const {
     // Decides whether \p Attr should be copied onto a function outlined from
     // \p Caller. The only case decided in the visible code is the string
     // attribute "interrupt", which is never copied.
     // NOTE(review): this listing skips source lines 3692 (start of the
     // signature) and 3701 (the return used for every other attribute --
     // presumably a call to the base-class implementation, confirm).
3694 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3695 // restrictions on their signatures). We can outline from the bodies of these
3696 // handlers, but when we do we need to make sure we don't mark the outlined
3697 // function as an interrupt handler too.
3698 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3699 return false;
3700
3702}
3703
3704std::optional<Instruction *>
3706 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
3707 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
3708 // creating redundant masks.
     // Every bail-out path below returns an empty optional.
     // NOTE(review): this listing skips source lines 3705 (the signature),
     // 3717 (start of the match on II) and 3747-3748 (start of the replacement
     // expression); the numbering jumps at those points.
3709 const DataLayout &DL = IC.getDataLayout();
3710 if (II.user_empty())
3711 return {};
     // The type of one user is taken as the candidate target type; the all_of
     // check below requires every user to be a bitcast to that same type.
3712 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
3713 if (!TargetVecTy)
3714 return {};
     // Scalar and VL are filled in by the pattern match when the splatted value
     // and the vector length are constants.
3715 const APInt *Scalar;
3716 uint64_t VL;
3718 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
3719 !all_of(II.users(), [TargetVecTy](User *U) {
3720 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
3721 }))
3722 return {};
3723 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
3724 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
3725 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
     // Only handle casts to an element width that is a whole multiple of the
     // source element width.
3726 if (TargetEltBW % SourceEltBW)
3727 return {};
3728 unsigned TargetScale = TargetEltBW / SourceEltBW;
     // VL must split evenly into groups of TargetScale source elements; a
     // scale of 1 means the element width does not change, so bail out.
3729 if (VL % TargetScale || TargetScale == 1)
3730 return {};
3731 Type *VLTy = II.getOperand(2)->getType();
3732 ElementCount SourceEC = SourceVecTy->getElementCount();
     // Given the divisibility check above, this equals TargetEltBW.
3733 unsigned NewEltBW = SourceEltBW * TargetScale;
3734 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
3735 !DL.fitsInLegalInteger(NewEltBW))
3736 return {};
3737 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
3738 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
3739 return {};
     // The new vector has TargetScale times fewer, TargetScale times wider
     // integer elements.
3740 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
3741 Type *RetTy = VectorType::get(NewEltTy, NewEC);
3742 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
3743 "Lossless bitcast between types expected");
     // Repeat the original scalar's bits to fill the wider element.
3744 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
     // The replacement is a vmv.v.x of RetTy with the widened scalar and
     // VL / TargetScale.
     // NOTE(review): the trailing SourceVecTy argument suggests the result is
     // cast back to the original type on the omitted lines -- confirm.
3745 return IC.replaceInstUsesWith(
3746 II,
3749 RetTy, Intrinsic::riscv_vmv_v_x,
3750 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
3751 ConstantInt::get(VLTy, VL / TargetScale)}),
3752 SourceVecTy));
3753}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-riscv-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V, bool LookThroughCmp=false)
Returns the "element type" of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:119
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:652
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
Get the last element.
Definition ArrayRef.h:150
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool isStringAttribute() const
Return true if the attribute is a string (target-dependent) attribute.
LLVM_ABI StringRef getKindAsString() const
Return the attribute's kind as a string.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:742
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:833
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:839
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noNaNs() const
Definition FMF.h:68
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2242
The core instruction combiner logic.
const DataLayout & getDataLayout() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
std::optional< InstructionCost > getCombinedArithmeticInstructionCost(unsigned ISDOpcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI) const
Check to see if this instruction is expected to be combined to a simpler operation during/before lowe...
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:301
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto adjacent_find(R &&Range)
Provide wrappers to std::adjacent_find which finds the first pair of adjacent elements that are equal...
Definition STLExtras.h:1817
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:204
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1969
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2145
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).