LLVM 23.0.0git
RISCVTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
// Command-line knobs for the RVV cost model.
// NOTE(review): the cl::opt declaration headers and the cl::init(...) lines
// (internal lines 28, 30, 33, 35, 37, 40, 42, 46) are missing from this
// capture -- verify against upstream before editing.
// Overrides the LMUL assumed by getRegisterBitWidth queries (and hence the
// vector width reported to autovectorizers).
 29     "riscv-v-register-bit-width-lmul",
 31         "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
 32         "by autovectorized code. Fractional LMULs are not supported."),
 34
// Overrides the result of getMaximumVF; consumed only by the SLP vectorizer.
 36     "riscv-v-slp-max-vf",
 38         "Overrides result used for getMaximumVF query which is used "
 39         "exclusively by SLP vectorizer."),
 41
// Minimum trip count required before tail-folded vectorization is attempted.
 43     RVVMinTripCount("riscv-v-min-trip-count",
 44                     cl::desc("Set the lower bound of a trip count to decide on "
 45                              "vectorization while tail-folding."),
 47
// Enables treating certain selects like OR for costing purposes.
// NOTE(review): semantics inferred from the flag name only -- confirm at the
// use site (not visible in this capture).
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
 49                                           cl::init(true), cl::Hidden);
50
// Cost of emitting the given sequence of RISC-V vector opcodes on vector
// type VT. vrgather/vslide opcodes use dedicated TLI cost hooks, unordered
// reductions cost log2(VL), the ordered FP reduction costs VL, scalar<->
// vector moves and mask-register ops cost 1, vdiv/vrem are LMUL-scaled
// TCC_Expensive, and anything else defaults to one LMUL-scaled instruction.
// NOTE(review): the return-type line and the early-return statements for
// non-vector types / CodeSize / non-throughput cost kinds (internal lines
// 51, 53, 56, 58, 61) are missing from this capture -- verify upstream.
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
 54   // Check if the type is valid for all CostKind
 55   if (!VT.isVector())
 57   size_t NumInstr = OpCodes.size();
 59     return NumInstr;
 60   InstructionCost LMULCost = TLI->getLMULCost(VT);
 62     return LMULCost * NumInstr;
 63   InstructionCost Cost = 0;
 64   for (auto Op : OpCodes) {
 65     switch (Op) {
 66     case RISCV::VRGATHER_VI:
 67       Cost += TLI->getVRGatherVICost(VT);
 68       break;
 69     case RISCV::VRGATHER_VV:
 70       Cost += TLI->getVRGatherVVCost(VT);
 71       break;
 72     case RISCV::VSLIDEUP_VI:
 73     case RISCV::VSLIDEDOWN_VI:
 74       Cost += TLI->getVSlideVICost(VT);
 75       break;
 76     case RISCV::VSLIDEUP_VX:
 77     case RISCV::VSLIDEDOWN_VX:
 78       Cost += TLI->getVSlideVXCost(VT);
 79       break;
    // Unordered reductions: tree reduction, so cost is log2 of the VL.
 80     case RISCV::VREDMAX_VS:
 81     case RISCV::VREDMIN_VS:
 82     case RISCV::VREDMAXU_VS:
 83     case RISCV::VREDMINU_VS:
 84     case RISCV::VREDSUM_VS:
 85     case RISCV::VREDAND_VS:
 86     case RISCV::VREDOR_VS:
 87     case RISCV::VREDXOR_VS:
 88     case RISCV::VFREDMAX_VS:
 89     case RISCV::VFREDMIN_VS:
 90     case RISCV::VFREDUSUM_VS: {
      // For scalable types, scale the known-minimum element count by the
      // vscale used for tuning.
 91       unsigned VL = VT.getVectorMinNumElements();
 92       if (!VT.isFixedLengthVector())
 93         VL *= *getVScaleForTuning();
 94       Cost += Log2_32_Ceil(VL);
 95       break;
 96     }
    // Ordered FP sum reduction is serialized: cost is linear in VL.
 97     case RISCV::VFREDOSUM_VS: {
 98       unsigned VL = VT.getVectorMinNumElements();
 99       if (!VT.isFixedLengthVector())
 100         VL *= *getVScaleForTuning();
 101       Cost += VL;
 102       break;
 103     }
    // Scalar<->vector moves and mask-register ops are LMUL-independent.
 104     case RISCV::VMV_X_S:
 105     case RISCV::VMV_S_X:
 106     case RISCV::VFMV_F_S:
 107     case RISCV::VFMV_S_F:
 108     case RISCV::VMOR_MM:
 109     case RISCV::VMXOR_MM:
 110     case RISCV::VMAND_MM:
 111     case RISCV::VMANDN_MM:
 112     case RISCV::VMNAND_MM:
 113     case RISCV::VCPOP_M:
 114     case RISCV::VFIRST_M:
 115       Cost += 1;
 116       break;
 117     case RISCV::VDIV_VV:
 118     case RISCV::VREM_VV:
 119       Cost += LMULCost * TTI::TCC_Expensive;
 120       break;
 121     default:
 122       Cost += LMULCost;
 123     }
 124   }
 125   return Cost;
 126}
127
// Shared helper: cost of materializing integer constant Imm of type Ty,
// estimated via RISCVMatInt's instruction-sequence length. Zero is free
// (the x0 register). FreeZeroes is forwarded to getIntMatCost.
// NOTE(review): the function header line (internal line 128, carrying the
// name and the DataLayout parameter) and the CostKind parameter line (131)
// are missing from this capture.
 129                                           const RISCVSubtarget *ST,
 130                                           const APInt &Imm, Type *Ty,
 132                                           bool FreeZeroes) {
 133   assert(Ty->isIntegerTy() &&
 134          "getIntImmCost can only estimate cost of materialising integers");
 135
 136   // We have a Zero register, so 0 is always free.
 137   if (Imm == 0)
 138     return TTI::TCC_Free;
 139
 140   // Otherwise, we check how many instructions it will take to materialise.
 141   return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
 142                                     /*CompressionCost=*/false, FreeZeroes);
 143}
 144
// Public entry point for plain integer-immediate cost queries; forwards to
// the shared helper with FreeZeroes=false.
// NOTE(review): the signature lines (internal lines 145-147) are missing
// from this capture; only the body remains.
 148   return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
 149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(BO->getOperand(1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Mask)) {
170 unsigned Trailing = llvm::countr_zero(Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Mask);
210 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
// Cost of integer immediate Imm appearing as operand Idx of an instruction
// with the given Opcode. Returns TCC_Free for immediates the ISA can fold
// (12-bit immediates, Zbb/Zba/Zbs special cases, shift-pair/shift-cmp
// patterns); otherwise falls back to the materialization cost. This is the
// main input to ConstantHoisting's profitability decisions.
// NOTE(review): the signature lines (internal lines 214 and 216, carrying
// the function name, Opcode/Idx parameters and CostKind) are missing from
// this capture.
 215                                            const APInt &Imm, Type *Ty,
 217                                            Instruction *Inst) const {
 218   assert(Ty->isIntegerTy() &&
 219          "getIntImmCost can only estimate cost of materialising integers");
 220
 221   // We have a Zero register, so 0 is always free.
 222   if (Imm == 0)
 223     return TTI::TCC_Free;
 224
 225   // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
 226   // commutative, in others the immediate comes from a specific argument index.
 227   bool Takes12BitImm = false;
 228   unsigned ImmArgIdx = ~0U;
 229
 230   switch (Opcode) {
 231   case Instruction::GetElementPtr:
 232     // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
 233     // split up large offsets in GEP into better parts than ConstantHoisting
 234     // can.
 235     return TTI::TCC_Free;
 236   case Instruction::Store: {
 237     // Use the materialization cost regardless of if it's the address or the
 238     // value that is constant, except for if the store is misaligned and
 239     // misaligned accesses are not legal (experience shows constant hoisting
 240     // can sometimes be harmful in such cases).
 241     if (Idx == 1 || !Inst)
 242       return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
 243                                /*FreeZeroes=*/true);
 244
 245     StoreInst *ST = cast<StoreInst>(Inst);
 246     if (!getTLI()->allowsMemoryAccessForAlignment(
 247             Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
 248             ST->getPointerAddressSpace(), ST->getAlign()))
 249       return TTI::TCC_Free;
 250
 251     return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
 252                              /*FreeZeroes=*/true);
 253   }
 254   case Instruction::Load:
 255     // If the address is a constant, use the materialization cost.
 256     return getIntImmCost(Imm, Ty, CostKind);
 257   case Instruction::And:
 258     // zext.h
 259     if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
 260       return TTI::TCC_Free;
 261     // zext.w
 262     if (Imm == UINT64_C(0xffffffff) &&
 263         ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
 264       return TTI::TCC_Free;
 265     // bclri
 266     if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
 267       return TTI::TCC_Free;
    // Free when the AND folds into a shift pair or a shifted compare.
 268     if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
 269         canUseShiftPair(Inst, Imm))
 270       return TTI::TCC_Free;
 271     if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
 272         canUseShiftCmp(Inst, Imm))
 273       return TTI::TCC_Free;
 274     Takes12BitImm = true;
 275     break;
 276   case Instruction::Add:
 277     Takes12BitImm = true;
 278     break;
 279   case Instruction::Or:
 280   case Instruction::Xor:
 281     // bseti/binvi
 282     if (ST->hasStdExtZbs() && Imm.isPowerOf2())
 283       return TTI::TCC_Free;
 284     Takes12BitImm = true;
 285     break;
 286   case Instruction::Mul:
 287     // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
 288     if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
 289       return TTI::TCC_Free;
 290     // One more or less than a power of 2 can use SLLI+ADD/SUB.
 291     if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
 292       return TTI::TCC_Free;
 293     // FIXME: There is no MULI instruction.
 294     Takes12BitImm = true;
 295     break;
 296   case Instruction::Sub:
 297   case Instruction::Shl:
 298   case Instruction::LShr:
 299   case Instruction::AShr:
    // Non-commutative: only the second operand can be an immediate.
 300     Takes12BitImm = true;
 301     ImmArgIdx = 1;
 302     break;
 303   default:
 304     break;
 305   }
 306
 307   if (Takes12BitImm) {
 308     // Check immediate is the correct argument...
 309     if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
 310       // ... and fits into the 12-bit immediate.
 311       if (Imm.getSignificantBits() <= 64 &&
 312           getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
 313         return TTI::TCC_Free;
 314       }
 315     }
 316
 317     // Otherwise, use the full materialisation cost.
 318     return getIntImmCost(Imm, Ty, CostKind);
 319   }
 320
 321   // By default, prevent hoisting.
 322   return TTI::TCC_Free;
 323 }
324
// Immediates in intrinsic operands: always reported as free so that
// ConstantHoisting leaves them alone.
// NOTE(review): the signature lines (internal lines 325-326 and 328) are
// missing from this capture.
 327                                                  const APInt &Imm, Type *Ty,
 329   // Prevent hoisting in unknown cases.
 330   return TTI::TCC_Free;
 331 }
332
// NOTE(review): the function header (internal line 333) is missing from this
// capture -- this is the body of a boolean query that is true exactly when
// the subtarget has the V (vector) instructions; confirm the name upstream.
 334   return ST->hasVInstructions();
 335 }
336
// Population-count support: fast hardware popcount when the subtarget has a
// CPOP-like instruction, software expansion otherwise.
// NOTE(review): the return-type line (internal line 337) is missing from
// this capture.
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
 339   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
 340   return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
 341}
342
// Cost of a partial-reduction (dot-product style) operation. Only the
// add-of-i8-multiplies into an i32 accumulator form is supported, and only
// when the Zvdot4a8i extension and a wide-enough ELEN are available.
// NOTE(review): the signature line (internal line 343), the VF/extend-kind
// parameter line (345) and the InstructionCost::getInvalid()-style early
// returns (internal lines 349 and 357) are missing from this capture.
 344     unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
 346     TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
 347     TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
 348   if (Opcode == Instruction::FAdd)
 350
 351   // zve32x is broken for partial_reduce_umla, but let's make sure we
 352   // don't generate them.
 353   if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
 354       Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
 355       InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
 356       !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
 358
  // One vdota4 per group of four 8-bit lanes feeding each i32 accumulator.
 359   Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
 360   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
 361   // Note: Asuming all vdota4* variants are equal cost
 362   return LT.first *
 363          getRISCVInstructionCost(RISCV::VDOTA4_VV, LT.second, CostKind);
 364}
365
// NOTE(review): the signature line (internal line 366, taking the
// IntrinsicInst *II) is missing from this capture.
 367   // Currently, the ExpandReductions pass can't expand scalable-vector
 368   // reductions, but we still request expansion as RVV doesn't support certain
 369   // reductions and the SelectionDAG can't legalize them either.
 370   switch (II->getIntrinsicID()) {
 371   default:
 372     return false;
 373   // These reductions have no equivalent in RVV
 374   case Intrinsic::vector_reduce_mul:
 375   case Intrinsic::vector_reduce_fmul:
 376     return true;
 377   }
 378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
// vscale value to assume when tuning: derived from the guaranteed-minimum
// VLEN when the V extension is present and VLEN covers at least one RVV
// block.
// NOTE(review): the fallback return (internal line 391, presumably the
// base-class result) is missing from this capture.
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
 387   if (ST->hasVInstructions())
 388     if (unsigned MinVLen = ST->getRealMinVLen();
 389         MinVLen >= RISCV::RVVBitsPerBlock)
 390       return MinVLen / RISCV::RVVBitsPerBlock;
 392}
393
// Register width reported per register kind: XLEN for scalars, LMUL-scaled
// minimum VLEN for fixed-width vectors (0 when RVV fixed-length vectors are
// disabled), and an RVV-block-scaled size for scalable vectors.
// NOTE(review): the function signature (internal lines 394-395) and the
// case labels / TypeSize::getScalable lines (internal lines 399, 401, 404,
// 405, 408) are missing from this capture.
 396   unsigned LMUL =
 397       llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
 398   switch (K) {
 400     return TypeSize::getFixed(ST->getXLen());
 402     return TypeSize::getFixed(
 403         ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
 406         (ST->hasVInstructions() &&
 407          ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
 409             : 0);
 410   }
 411
 412   llvm_unreachable("Unsupported register kind");
 413}
414
// Cost of forming the address of static data (AUIPC+ADDI or LUI+ADDI
// depending on the code model): always two instructions, except for latency
// where a fused pair counts as one.
// NOTE(review): the case labels before the first return (internal lines
// 418-419) and the label before the latency branch (internal line 423) are
// missing from this capture.
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
 416     const TTI::TargetCostKind CostKind) const {
 417   switch (CostKind) {
 420     // Always 2 instructions
 421     return 2;
 422   case TTI::TCK_Latency:
 424     // Depending on the memory model the address generation will
 425     // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
 426     // have a way of getting this information here, so conservatively
 427     // require both.
 428     // In practice, these are generally implemented together.
 429     return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
 430   }
 431   llvm_unreachable("Unsupported cost kind");
 432}
433
// Cost of loading a constant-pool entry of type Ty: PC-relative address
// generation plus an ABI-aligned load from address space 0.
// NOTE(review): the return-type line (internal line 434) and the CostKind
// parameter line (436) are missing from this capture.
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
 437   // Add a cost of address generation + the cost of the load. The address
 438   // is expected to be a PC relative offset to a constant pool entry
 439   // using auipc/addi.
 440   return getStaticDataAddrGenerationCost(CostKind) +
 441          getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
 442                          /*AddressSpace=*/0, CostKind);
 443}
444
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
// Builds the IR vector type used for a vrgather index operand matching
// DataVT; when the integer index element would be wider than XLEN it is
// narrowed to i16 (valid given the <=256-element guard asserted below).
// NOTE(review): the signature line (internal line 467, taking MVT DataVT
// and the RISCVSubtarget) is missing from this capture.
 468                                         LLVMContext &C) {
 469   assert((DataVT.getScalarSizeInBits() != 8 ||
 470           DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
 471   MVT IndexVT = DataVT.changeTypeToInteger();
 472   if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
 473     IndexVT = IndexVT.changeVectorElementType(MVT::i16);
 474   return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
 475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upperbound.
// NOTE(review): the function signature (internal line 481), the CostKind
// parameter (484), part of the element-size comparison (494), the invalid
// return (497), the processShuffleMasks call line (515) and the shuffle-kind
// arguments inside the lambdas (524, 531 area) are missing from this
// capture -- verify against upstream before editing.
 482                                                  MVT LegalVT, VectorType *Tp,
 483                                                  ArrayRef<int> Mask,
 485   assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
 486          "Expected fixed vector type and non-empty mask");
 487   unsigned LegalNumElts = LegalVT.getVectorNumElements();
 488   // Number of destination vectors after legalization:
 489   unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
 490   // We are going to permute multiple sources and the result will be in
 491   // multiple destinations. Providing an accurate cost only for splits where
 492   // the element type remains the same.
 493   if (NumOfDests <= 1 ||
 495           Tp->getElementType()->getPrimitiveSizeInBits() ||
 496       LegalNumElts >= Tp->getElementCount().getFixedValue())
 498
 499   unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
 500   unsigned LegalVTSize = LegalVT.getStoreSize();
 501   // Number of source vectors after legalization:
 502   unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
 503
 504   auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
 505
 506   unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
 507   unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
 508   unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
 509   SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
 510   assert(NormalizedVF >= Mask.size() &&
 511          "Normalized mask expected to be not shorter than original mask.");
 512   copy(Mask, NormalizedMask.begin());
 513   InstructionCost Cost = 0;
 514   SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
 516       NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
 517       [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
        // Identity per-register masks are free; duplicate single-source
        // shuffles are only counted once.
 518         if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
 519           return;
 520         if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
 521                  .second)
 522           return;
 523         Cost += TTI.getShuffleCost(
 525             FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
 526             SingleOpTy, RegMask, CostKind, 0, nullptr);
 527       },
 528       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
 529         Cost += TTI.getShuffleCost(
 531             FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
 532             SingleOpTy, RegMask, CostKind, 0, nullptr);
 533       });
 534   return Cost;
 535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540///    permuted. If more than 1 source registers are used for the
541///    destination register building, the cost for this destination register
542///    is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543///    source register is used, build mask and calculate the cost as a cost
544///    of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
// NOTE(review): the function signature (internal line 551), the Mask/
// CostKind parameters (553), the invalid-cost early returns (556, 569, 615),
// part of the element-size comparison (566) and the processShuffleMasks call
// line (591) are missing from this capture -- verify against upstream.
 552                             std::optional<unsigned> VLen, VectorType *Tp,
 554   assert(LegalVT.isFixedLengthVector());
 555   if (!VLen || Mask.empty())
  // Re-legalize at the granularity of a single vector register, which is
  // only possible once the exact VLEN is known.
 557   MVT ElemVT = LegalVT.getVectorElementType();
 558   unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
 559   LegalVT = TTI.getTypeLegalizationCost(
 560                    FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
 561                 .second;
 562   // Number of destination vectors after legalization:
 563   InstructionCost NumOfDests =
 564       divideCeil(Mask.size(), LegalVT.getVectorNumElements());
 565   if (NumOfDests <= 1 ||
 567           Tp->getElementType()->getPrimitiveSizeInBits() ||
 568       LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
 570
 571   unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
 572   unsigned LegalVTSize = LegalVT.getStoreSize();
 573   // Number of source vectors after legalization:
 574   unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
 575
 576   auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
 577                                           LegalVT.getVectorNumElements());
 578
 579   unsigned E = NumOfDests.getValue();
 580   unsigned NormalizedVF =
 581       LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
 582   unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
 583   unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
 584   SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
 585   assert(NormalizedVF >= Mask.size() &&
 586          "Normalized mask expected to be not shorter than original mask.");
 587   copy(Mask, NormalizedMask.begin());
 588   InstructionCost Cost = 0;
 589   int NumShuffles = 0;
 590   SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
 592       NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
 593       [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
 594         if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
 595           return;
 596         if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
 597                  .second)
 598           return;
 599         ++NumShuffles;
 600         Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
 601                                    SingleOpTy, RegMask, CostKind, 0, nullptr);
 602       },
 603       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
 604         Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
 605                                    SingleOpTy, RegMask, CostKind, 0, nullptr);
 606         NumShuffles += 2;
 607       });
 608   // Note: check that we do not emit too many shuffles here to prevent code
 609   // size explosion.
 610   // TODO: investigate, if it can be improved by extra analysis of the masks
 611   // to check if the code is more profitable.
 612   if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
 613       (NumOfDestRegs <= 2 && NumShuffles < 4))
 614     return Cost;
 616}
617
// Costs a shuffle that lowers to at most two vslideup/vslidedown ops (a
// "masked slide pair"), plus a vmerge and a constant-pool mask load when two
// sources are merged.
// NOTE(review): the CostKind parameter line (internal line 620), several
// invalid-cost early returns (623, 629, 634, 646) and the mask-type
// construction line (671) are missing from this capture.
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
 619                                            ArrayRef<int> Mask,
 621   // Avoid missing masks and length changing shuffles
 622   if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
 624
 625   int NumElts = Tp->getNumElements();
 626   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
 627   // Avoid scalarization cases
 628   if (!LT.second.isFixedLengthVector())
 630
 631   // Requires moving elements between parts, which requires additional
 632   // unmodeled instructions.
 633   if (LT.first != 1)
 635
  // Pick the immediate form when the slide amount fits 5 bits.
 636   auto GetSlideOpcode = [&](int SlideAmt) {
 637     assert(SlideAmt != 0);
 638     bool IsVI = isUInt<5>(std::abs(SlideAmt));
 639     if (SlideAmt < 0)
 640       return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
 641     return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
 642   };
 643
 644   std::array<std::pair<int, int>, 2> SrcInfo;
 645   if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
 647
  // Canonicalize so any zero-offset source comes first.
 648   if (SrcInfo[1].second == 0)
 649     std::swap(SrcInfo[0], SrcInfo[1]);
 650
 651   InstructionCost FirstSlideCost = 0;
 652   if (SrcInfo[0].second != 0) {
 653     unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
 654     FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
 655   }
 656
  // Single-source case: one slide suffices.
 657   if (SrcInfo[1].first == -1)
 658     return FirstSlideCost;
 659
 660   InstructionCost SecondSlideCost = 0;
 661   if (SrcInfo[1].second != 0) {
 662     unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
 663     SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
 664   } else {
 665     SecondSlideCost =
 666         getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
 667   }
 668
  // Merging two sources also needs a mask, loaded from the constant pool.
 669   auto EC = Tp->getElementCount();
 670   VectorType *MaskTy =
 672   InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
 673   return FirstSlideCost + SecondSlideCost + MaskCost;
 674}
675
// Shuffle cost model entry point. Tries fixed-vector fast paths first
// (vreg-splitting estimate, interleave/deinterleave of 2, repeated-concat,
// slide pairs, vrgather with constant-pool index/mask), then handles the
// generic kinds (extract/insert subvector, select, broadcast, splice,
// reverse) for scalable or legalized-scalable types, and finally defers to
// the base implementation.
// NOTE(review): this capture lost the function signature (internal lines
// 676-677, 680) and several interior lines, including some `case` labels;
// the seams are marked inline below. Verify against upstream before editing.
 678                                         VectorType *SrcTy, ArrayRef<int> Mask,
 679                                         TTI::TargetCostKind CostKind, int Index,
 681                                         const Instruction *CxtI) const {
 682   assert((Mask.empty() || DstTy->isScalableTy() ||
 683           Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
 684          "Expected the Mask to match the return size if given");
 685   assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
 686          "Expected the same scalar types");
 687
 688   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 689
 690   // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
 691   // For now, skip all fixed vector cost analysis when P extension is available
 692   // to avoid crashes in getMinRVVVectorSizeInBits()
 693   if (ST->hasStdExtP() && isa<FixedVectorType>(SrcTy))
 694     return 1;
 695
 696   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
 697
 698   // First, handle cases where having a fixed length vector enables us to
 699   // give a more accurate cost than falling back to generic scalable codegen.
 700   // TODO: Each of these cases hints at a modeling gap around scalable vectors.
 701   if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
 702       FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
    // NOTE(review): the call line assigning VRegSplittingCost from
    // costShuffleViaVRegSplitting (internal line 703) is missing here.
 704         *this, LT.second, ST->getRealVLen(),
 705         Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
 706     if (VRegSplittingCost.isValid())
 707       return VRegSplittingCost;
 708     switch (Kind) {
 709     default:
 710       break;
    // NOTE(review): a case label (likely TTI::SK_PermuteSingleSrc, internal
    // line 711) is missing here.
 712       if (Mask.size() >= 2) {
 713         MVT EltTp = LT.second.getVectorElementType();
 714         // If the size of the element is < ELEN then shuffles of interleaves and
 715         // deinterleaves of 2 vectors can be lowered into the following
 716         // sequences
 717         if (EltTp.getScalarSizeInBits() < ST->getELen()) {
 718           // Example sequence:
 719           //   vsetivli     zero, 4, e8, mf4, ta, ma (ignored)
 720           //   vwaddu.vv    v10, v8, v9
 721           //   li           a0, -1   (ignored)
 722           //   vwmaccu.vx   v10, a0, v9
 723           if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
 724             return 2 * LT.first * TLI->getLMULCost(LT.second);
 725
 726           if (Mask[0] == 0 || Mask[0] == 1) {
 727             auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
 728             // Example sequence:
 729             //   vnsrl.wi   v10, v8, 0
 730             if (equal(DeinterleaveMask, Mask))
 731               return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
 732                                                         LT.second, CostKind);
 733           }
 734         }
 735         int SubVectorSize;
 736         if (LT.second.getScalarSizeInBits() != 1 &&
 737             isRepeatedConcatMask(Mask, SubVectorSize)) {
        // NOTE(review): the accumulator declaration (internal line 738,
        // presumably `InstructionCost Cost = 0;`) is missing here.
 739           unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
 740           // The cost of extraction from a subvector is 0 if the index is 0.
 741           for (unsigned I = 0; I != NumSlides; ++I) {
 742             unsigned InsertIndex = SubVectorSize * (1 << I);
 743             FixedVectorType *SubTp =
 744                 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
 745             FixedVectorType *DestTp =
 747             std::pair<InstructionCost, MVT> DestLT =
 749             // Add the cost of whole vector register move because the
 750             // destination vector register group for vslideup cannot overlap the
 751             // source.
 752             Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
 753             Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
 754                                    CostKind, InsertIndex, SubTp);
 755           }
 756           return Cost;
 757         }
 758       }
 759
 760       if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
 761           SlideCost.isValid())
 762         return SlideCost;
 763
 764       // vrgather + cost of generating the mask constant.
 765       // We model this for an unknown mask with a single vrgather.
 766       if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
 767                             LT.second.getVectorNumElements() <= 256)) {
 768         VectorType *IdxTy =
 769             getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
 770         InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
 771         return IndexCost +
 772                getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
 773       }
 774       break;
 775     }
    // NOTE(review): a closing brace and case label (likely
    // TTI::SK_PermuteTwoSrc, internal lines 776-777) are missing here.
 778
 779       if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
 780           SlideCost.isValid())
 781         return SlideCost;
 782
 783       // 2 x (vrgather + cost of generating the mask constant) + cost of mask
 784       // register for the second vrgather. We model this for an unknown
 785       // (shuffle) mask.
 786       if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
 787                             LT.second.getVectorNumElements() <= 256)) {
 788         auto &C = SrcTy->getContext();
 789         auto EC = SrcTy->getElementCount();
 790         VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        // NOTE(review): the MaskTy construction (internal line 791) is
        // missing here.
 792         InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
 793         InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
 794         return 2 * IndexCost +
 795                getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
 796                                        LT.second, CostKind) +
 797                MaskCost;
 798       }
 799       break;
 800     }
 801     }
 802
 803     auto shouldSplit = [](TTI::ShuffleKind Kind) {
 804       switch (Kind) {
 805       default:
 806         return false;
      // NOTE(review): the case labels for the kinds that should be split
      // (internal lines 807-809) are missing here.
 810         return true;
 811       }
 812     };
 813
 814     if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
 815         shouldSplit(Kind)) {
 816       InstructionCost SplitCost =
 817           costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
 818       if (SplitCost.isValid())
 819         return SplitCost;
 820     }
 821   }
 822
 823   // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
 824   switch (Kind) {
 825   default:
 826     // Fallthrough to generic handling.
 827     // TODO: Most of these cases will return getInvalid in generic code, and
 828     // must be implemented here.
 829     break;
  // NOTE(review): a case label (likely TTI::SK_ExtractSubvector, internal
  // line 830) is missing here.
 831     // Extract at zero is always a subregister extract
 832     if (Index == 0)
 833       return TTI::TCC_Free;
 834
 835     // If we're extracting a subvector of at most m1 size at a sub-register
 836     // boundary - which unfortunately we need exact vlen to identify - this is
 837     // a subregister extract at worst and thus won't require a vslidedown.
 838     // TODO: Extend for aligned m2, m4 subvector extracts
 839     // TODO: Extend for misalgined (but contained) extracts
 840     // TODO: Extend for scalable subvector types
 841     if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
 842         SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
 843       if (std::optional<unsigned> VLen = ST->getRealVLen();
 844           VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
 845           SubLT.second.getSizeInBits() <= *VLen)
 846         return TTI::TCC_Free;
 847     }
 848
 849     // Example sequence:
 850     // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
 851     // vslidedown.vi  v8, v9, 2
 852     return LT.first *
 853            getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  // NOTE(review): a case label (likely TTI::SK_InsertSubvector, internal
  // line 854) is missing here.
 855     // Example sequence:
 856     // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
 857     // vslideup.vi  v8, v9, 2
 858     LT = getTypeLegalizationCost(DstTy);
 859     return LT.first *
 860            getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
 861   case TTI::SK_Select: {
 862     // Example sequence:
 863     // li           a0, 90
 864     // vsetivli     zero, 8, e8, mf2, ta, ma (ignored)
 865     // vmv.s.x      v0, a0
 866     // vmerge.vvm   v8, v9, v8, v0
 867     // We use 2 for the cost of the mask materialization as this is the true
 868     // cost for small masks and most shuffles are small.  At worst, this cost
 869     // should be a very small constant for the constant pool load.  As such,
 870     // we may bias towards large selects slightly more than truly warranted.
 871     return LT.first *
 872            (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
 873                                         LT.second, CostKind));
 874   }
 875   case TTI::SK_Broadcast: {
 876     bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
 877                                            Instruction::InsertElement);
 878     if (LT.second.getScalarSizeInBits() == 1) {
 879       if (HasScalar) {
 880         // Example sequence:
 881         //   andi a0, a0, 1
 882         //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)
 883         //   vmv.v.x v8, a0
 884         //   vmsne.vi v0, v8, 0
 885         return LT.first *
 886                (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
 887                                             LT.second, CostKind));
 888       }
 889       // Example sequence:
 890       //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored)
 891       //   vmv.v.i v8, 0
 892       //   vmerge.vim      v8, v8, 1, v0
 893       //   vmv.x.s a0, v8
 894       //   andi    a0, a0, 1
 895       //   vmv.v.x v8, a0
 896       //   vmsne.vi  v0, v8, 0
 897
 898       return LT.first *
 899              (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
 900                                            RISCV::VMV_X_S, RISCV::VMV_V_X,
 901                                            RISCV::VMSNE_VI},
 902                                           LT.second, CostKind));
 903     }
 904
 905     if (HasScalar) {
 906       // Example sequence:
 907       //   vmv.v.x v8, a0
 908       return LT.first *
 909              getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
 910     }
 911
 912     // Example sequence:
 913     //   vrgather.vi     v9, v8, 0
 914     return LT.first *
 915            getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
 916   }
 917   case TTI::SK_Splice: {
 918     // vslidedown+vslideup.
 919     // TODO: Multiplying by LT.first implies this legalizes into multiple copies
 920     // of similar code, but I think we expand through memory.
 921     unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
 922     if (Index >= 0 && Index < 32)
 923       Opcodes[0] = RISCV::VSLIDEDOWN_VI;
 924     else if (Index < 0 && Index > -32)
 925       Opcodes[1] = RISCV::VSLIDEUP_VI;
 926     return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
 927   }
 928   case TTI::SK_Reverse: {
 929
 930     if (!LT.second.isVector())
    // NOTE(review): the invalid-cost return (internal line 931) is missing
    // here.
 932
 933     // TODO:  Cases to improve here:
 934     // * Illegal vector types
 935     // * i64 on RV32
 936     if (SrcTy->getElementType()->isIntegerTy(1)) {
 937       VectorType *WideTy =
 938           VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
 939                           cast<VectorType>(SrcTy)->getElementCount());
 940       return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
 942              getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
 943                             nullptr) +
 944              getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
 946     }
 947
 948     MVT ContainerVT = LT.second;
 949     if (LT.second.isFixedLengthVector())
 950       ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
 951     MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
 952     if (ContainerVT.bitsLE(M1VT)) {
 953       // Example sequence:
 954       //   csrr a0, vlenb
 955       //   srli a0, a0, 3
 956       //   addi a0, a0, -1
 957       //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
 958       //   vid.v v9
 959       //   vrsub.vx v10, v9, a0
 960       //   vrgather.vv v9, v8, v10
 961       InstructionCost LenCost = 3;
 962       if (LT.second.isFixedLengthVector())
 963         // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
 964         LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
 965       unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
 966       if (LT.second.isFixedLengthVector() &&
 967           isInt<5>(LT.second.getVectorNumElements() - 1))
 968         Opcodes[1] = RISCV::VRSUB_VI;
 969       InstructionCost GatherCost =
 970           getRISCVInstructionCost(Opcodes, LT.second, CostKind);
 971       return LT.first * (LenCost + GatherCost);
 972     }
 973
 974     // At high LMUL, we split into a series of M1 reverses (see
 975     // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
 976     // the resulting gap at the bottom (for fixed vectors only).  The important
 977     // bit is that the cost scales linearly, not quadratically with LMUL.
 978     unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
 979     InstructionCost FixedCost =
 980         getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
 981     unsigned Ratio =
    // NOTE(review): the Ratio computation continuation (internal line 982)
    // is missing here.
 983     InstructionCost GatherCost =
 984         getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
 985     InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
 986       getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
 987     return FixedCost + LT.first * (GatherCost + SlideCost);
 988   }
 989   }
 990   return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
 991                                SubTp);
 992}
993
// Predicate: is VT's register group LMUL 1 or a fractional LMUL (i.e. it
// fits in at most one vector register)? Used below to decide whether a
// build_vector can be priced as one vslide1down.vx per element.
// NOTE(review): interior lines of this function (the LMUL computation and
// the remaining comparison operands) were dropped from this extract —
// verify against the full source before editing.
994 static unsigned isM1OrSmaller(MVT VT) {
 996 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
 1000}
1001
// Estimate the insert/extract (scalarization) overhead for the demanded
// elements of vector type Ty, with RISC-V refinements: i1 vectors are costed
// via an i8 widening + trunc, and small (<= m1) build_vectors are capped at
// one vslide1down.vx per element.
// NOTE(review): the enclosing signature line and a few interior lines
// (scalable-vector early-exit, the BaseT::getScalarizationOverhead call, a
// CastContextHint argument) were dropped from this extract.
 1003 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
 1004 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
 1005 TTI::VectorInstrContext VIC) const {
 1008
 1009 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
 1010 // For now, skip all fixed vector cost analysis when P extension is available
 1011 // to avoid crashes in getMinRVVVectorSizeInBits()
 1012 if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
 1013 return 1; // Treat as single instruction cost for now
 1014 }
 1015
 1016 // A build_vector (which is m1 sized or smaller) can be done in no
 1017 // worse than one vslide1down.vx per element in the type. We could
 1018 // in theory do an explode_vector in the inverse manner, but our
 1019 // lowering today does not have a first class node for this pattern.
 1021 Ty, DemandedElts, Insert, Extract, CostKind);
 1022 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
 1023 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
 1024 if (Ty->getScalarSizeInBits() == 1) {
 1025 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
 1026 // Note: Implicit scalar anyextend is assumed to be free since the i1
 1027 // must be stored in a GPR.
 1028 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
 1029 CostKind) +
 1030 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
 1032 }
 1033
 1034 assert(LT.second.isFixedLengthVector());
 1035 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
 1036 if (isM1OrSmaller(ContainerVT)) {
 1037 InstructionCost BV =
 1038 cast<FixedVectorType>(Ty)->getNumElements() *
 1039 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
 1040 if (BV < Cost)
 1041 Cost = BV;
 1042 }
 1043 }
 1044 return Cost;
 1045}
1046
// Route a memory-intrinsic cost query (MICA) to the matching specialized
// RISC-V cost hook: first-fault vp loads are priced as a plain load when
// legal; strided, gather/scatter, and masked load/store intrinsics dispatch
// to their dedicated helpers below.
// NOTE(review): this extract is missing the signature line, the
// invalid-cost early return for illegal first-fault loads, and the
// expand/compress dispatch line — verify against the full source.
 1050 Type *DataTy = MICA.getDataType();
 1051 Align Alignment = MICA.getAlignment();
 1052 switch (MICA.getID()) {
 1053 case Intrinsic::vp_load_ff: {
 1054 EVT DataTypeVT = TLI->getValueType(DL, DataTy);
 1055 if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
 1057
 1058 unsigned AS = MICA.getAddressSpace();
 1059 return getMemoryOpCost(Instruction::Load, DataTy, Alignment, AS, CostKind,
 1060 {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
 1061 }
 1062 case Intrinsic::experimental_vp_strided_load:
 1063 case Intrinsic::experimental_vp_strided_store:
 1064 return getStridedMemoryOpCost(MICA, CostKind);
 1065 case Intrinsic::masked_compressstore:
 1066 case Intrinsic::masked_expandload:
 1068 case Intrinsic::vp_scatter:
 1069 case Intrinsic::vp_gather:
 1070 case Intrinsic::masked_scatter:
 1071 case Intrinsic::masked_gather:
 1072 return getGatherScatterOpCost(MICA, CostKind);
 1073 case Intrinsic::vp_load:
 1074 case Intrinsic::vp_store:
 1075 case Intrinsic::masked_load:
 1076 case Intrinsic::masked_store:
 1077 return getMaskedMemoryOpCost(MICA, CostKind);
 1078 }
 1080}
1081
// Cost of a masked (or vp) load/store: when the target supports it legally,
// a masked memory op is priced the same as the equivalent unmasked
// getMemoryOpCost query.
// NOTE(review): the signature and the illegal/wrong-cost-kind fallback
// lines were dropped from this extract.
 1085 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
 1086 : Instruction::Store;
 1087 Type *Src = MICA.getDataType();
 1088 Align Alignment = MICA.getAlignment();
 1089 unsigned AddressSpace = MICA.getAddressSpace();
 1090
 1091 if (!isLegalMaskedLoadStore(Src, Alignment) ||
 1094
 1095 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
 1096}
1097
// Cost of an interleaved (de)interleaved memory access of `Factor` members.
// Prefers the segment load/store (vlseg/vsseg) model when the access is
// legal for it; otherwise falls back to a wide memory op plus per-member
// shuffle costs (loads), or a single interleaving shuffle (Factor==2
// stores), or the base-class model.
// NOTE(review): several original lines were dropped from this extract
// (signature start, an `InstructionCost Cost =` initializer, the
// scalable-vector invalid return, and the shuffle-kind arguments of two
// getShuffleCost calls).
 1099 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
 1100 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
 1101 bool UseMaskForCond, bool UseMaskForGaps) const {
 1102
 1103 // The interleaved memory access pass will lower (de)interleave ops combined
 1104 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
 1105 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
 1106 // gap).
 1107 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
 1108 auto *VTy = cast<VectorType>(VecTy);
 1109 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
 1110 // Need to make sure type hasn't been scalarized
 1111 if (LT.second.isVector()) {
 1112 auto *SubVecTy =
 1113 VectorType::get(VTy->getElementType(),
 1114 VTy->getElementCount().divideCoefficientBy(Factor));
 1115 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
 1116 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
 1117 AddressSpace, DL)) {
 1118
 1119 // Some processors optimize segment loads/stores as one wide memory op +
 1120 // Factor * LMUL shuffle ops.
 1121 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
 1123 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
 1124 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
 1125 Cost += Factor * TLI->getLMULCost(SubVecVT);
 1126 return LT.first * Cost;
 1127 }
 1128
 1129 // Otherwise, the cost is proportional to the number of elements (VL *
 1130 // Factor ops).
 1131 InstructionCost MemOpCost =
 1132 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
 1133 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
 1134 unsigned NumLoads = getEstimatedVLFor(VTy);
 1135 return NumLoads * MemOpCost;
 1136 }
 1137 }
 1138 }
 1139
 1140 // TODO: Return the cost of interleaved accesses for scalable vector when
 1141 // unable to convert to segment accesses instructions.
 1142 if (isa<ScalableVectorType>(VecTy))
 1144
 1145 auto *FVTy = cast<FixedVectorType>(VecTy);
 1146 InstructionCost MemCost =
 1147 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
 1148 unsigned VF = FVTy->getNumElements() / Factor;
 1149
 1150 // An interleaved load will look like this for Factor=3:
 1151 // %wide.vec = load <12 x i32>, ptr %3, align 4
 1152 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
 1153 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
 1154 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
 1155 if (Opcode == Instruction::Load) {
 1156 InstructionCost Cost = MemCost;
 1157 for (unsigned Index : Indices) {
 1158 FixedVectorType *VecTy =
 1159 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
 1160 auto Mask = createStrideMask(Index, Factor, VF);
 1161 Mask.resize(VF * Factor, -1);
 1162 InstructionCost ShuffleCost =
 1164 Mask, CostKind, 0, nullptr, {});
 1165 Cost += ShuffleCost;
 1166 }
 1167 return Cost;
 1168 }
 1169
 1170 // TODO: Model for NF > 2
 1171 // We'll need to enhance getShuffleCost to model shuffles that are just
 1172 // inserts and extracts into subvectors, since they won't have the full cost
 1173 // of a vrgather.
 1174 // An interleaved store for 3 vectors of 4 lanes will look like
 1175 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
 1176 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
 1177 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
 1178 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
 1179 // store <12 x i32> %interleaved.vec, ptr %10, align 4
 1180 if (Factor != 2)
 1181 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
 1182 Alignment, AddressSpace, CostKind,
 1183 UseMaskForCond, UseMaskForGaps);
 1184
 1185 assert(Opcode == Instruction::Store && "Opcode must be a store");
 1186 // For an interleaving store of 2 vectors, we perform one large interleaving
 1187 // shuffle that goes into the wide store
 1188 auto Mask = createInterleaveMask(VF, Factor);
 1189 InstructionCost ShuffleCost =
 1191 CostKind, 0, nullptr, {});
 1192 return MemCost + ShuffleCost;
 1193}
1194
// Cost of a gather (indexed load) or scatter (indexed store): when the
// operation is legal, it is modeled as one basic memory op per element,
// using an estimated VL for scalable vectors.
// NOTE(review): the signature lines and the illegal-case fallback return
// were dropped from this extract.
 1198
 1199 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
 1200 MICA.getID() == Intrinsic::vp_gather;
 1201 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
 1202 Type *DataTy = MICA.getDataType();
 1203 Align Alignment = MICA.getAlignment();
 1206
 1207 if ((Opcode == Instruction::Load &&
 1208 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
 1209 (Opcode == Instruction::Store &&
 1210 !isLegalMaskedScatter(DataTy, Align(Alignment))))
 1212
 1213 // Cost is proportional to the number of memory operations implied. For
 1214 // scalable vectors, we use an estimate on that number since we don't
 1215 // know exactly what VL will be.
 1216 auto &VTy = *cast<VectorType>(DataTy);
 1217 unsigned NumLoads = getEstimatedVLFor(&VTy);
 1218 return NumLoads * TTI::TCC_Basic;
 1219}
1220
// Cost of masked_compressstore / masked_expandload: one plain memory op
// plus the vcompress (store) or viota+vrgather (load) shuffle sequence our
// lowering emits, with vcpop added when the mask is not constant.
// NOTE(review): the first signature line and the fallback return for the
// illegal / non-throughput case were dropped from this extract.
 1222 const MemIntrinsicCostAttributes &MICA,
 1224 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
 1225 ? Instruction::Load
 1226 : Instruction::Store;
 1227 Type *DataTy = MICA.getDataType();
 1228 bool VariableMask = MICA.getVariableMask();
 1229 Align Alignment = MICA.getAlignment();
 1230 bool IsLegal = (Opcode == Instruction::Store &&
 1231 isLegalMaskedCompressStore(DataTy, Alignment)) ||
 1232 (Opcode == Instruction::Load &&
 1233 isLegalMaskedExpandLoad(DataTy, Alignment));
 1234 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
 1236 // Example compressstore sequence:
 1237 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
 1238 // vcompress.vm v10, v8, v0
 1239 // vcpop.m a1, v0
 1240 // vsetvli zero, a1, e32, m2, ta, ma
 1241 // vse32.v v10, (a0)
 1242 // Example expandload sequence:
 1243 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
 1244 // vcpop.m a1, v0
 1245 // vsetvli zero, a1, e32, m2, ta, ma
 1246 // vle32.v v10, (a0)
 1247 // vsetivli zero, 8, e32, m2, ta, ma
 1248 // viota.m v12, v0
 1249 // vrgather.vv v8, v10, v12, v0.t
 1250 auto MemOpCost =
 1251 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
 1252 auto LT = getTypeLegalizationCost(DataTy);
 1253 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
 1254 if (VariableMask)
 1255 Opcodes.push_back(RISCV::VCPOP_M);
 1256 if (Opcode == Instruction::Store)
 1257 Opcodes.append({RISCV::VCOMPRESS_VM});
 1258 else
 1259 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
 1260 return MemOpCost +
 1261 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
 1262}
1263
// Cost of an experimental_vp_strided_load/store: modeled as one scalar-width
// memory op per element (estimated VL for scalable vectors) when the strided
// access is legal.
// NOTE(review): the signature lines, the illegal-case fallback return, and
// the guard before the `return TTI::TCC_Basic;` (presumably a
// non-throughput cost-kind check — confirm upstream) were dropped from this
// extract.
 1267
 1268 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
 1269 ? Instruction::Load
 1270 : Instruction::Store;
 1271
 1272 Type *DataTy = MICA.getDataType();
 1273 Align Alignment = MICA.getAlignment();
 1274 const Instruction *I = MICA.getInst();
 1275
 1276 if (!isLegalStridedLoadStore(DataTy, Alignment))
 1278
 1280 return TTI::TCC_Basic;
 1281
 1282 // Cost is proportional to the number of memory operations implied. For
 1283 // scalable vectors, we use an estimate on that number since we don't
 1284 // know exactly what VL will be.
 1285 // FIXME: This will overcost for i64 on rv32 with +zve64x.
 1286 auto &VTy = *cast<VectorType>(DataTy);
 1287 InstructionCost MemOpCost =
 1288 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
 1289 {TTI::OK_AnyValue, TTI::OP_None}, I);
 1290 unsigned NumLoads = getEstimatedVLFor(&VTy);
 1291 return NumLoads * MemOpCost;
 1292}
1293
// Models the cost of keeping vector values live across a call: each vector
// type is charged one store (spill) plus one load (reload) at its preferred
// alignment. NOTE(review): the enclosing signature and the `Cost`
// accumulator declaration were dropped from this extract — presumably this
// is getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); confirm upstream.
 1296 // FIXME: This is a property of the default vector convention, not
 1297 // all possible calling conventions. Fixing that will require
 1298 // some TTI API and SLP rework.
 1301 for (auto *Ty : Tys) {
 1302 if (!Ty->isVectorTy())
 1303 continue;
 1304 Align A = DL.getPrefTypeAlign(Ty);
 1305 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
 1306 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
 1307 }
 1308 return Cost;
 1309}
1310
 1311// Currently, these represent both throughput and codesize costs
 1312// for the respective intrinsics. The costs in this table are simply
 1313// instruction counts with the following adjustments made:
 1314// * One vsetvli is considered free.
// Entries are keyed by (intrinsic, element MVT); the lookup in
// getIntrinsicInstrCost scales the listed cost by the legalization factor
// (LT.first) for split types.
// NOTE(review): the table's declaration line was dropped from this extract.
 1316 {Intrinsic::floor, MVT::f32, 9},
 1317 {Intrinsic::floor, MVT::f64, 9},
 1318 {Intrinsic::ceil, MVT::f32, 9},
 1319 {Intrinsic::ceil, MVT::f64, 9},
 1320 {Intrinsic::trunc, MVT::f32, 7},
 1321 {Intrinsic::trunc, MVT::f64, 7},
 1322 {Intrinsic::round, MVT::f32, 9},
 1323 {Intrinsic::round, MVT::f64, 9},
 1324 {Intrinsic::roundeven, MVT::f32, 9},
 1325 {Intrinsic::roundeven, MVT::f64, 9},
 1326 {Intrinsic::rint, MVT::f32, 7},
 1327 {Intrinsic::rint, MVT::f64, 7},
 1328 {Intrinsic::nearbyint, MVT::f32, 9},
 1329 {Intrinsic::nearbyint, MVT::f64, 9},
 1330 {Intrinsic::bswap, MVT::i16, 3},
 1331 {Intrinsic::bswap, MVT::i32, 12},
 1332 {Intrinsic::bswap, MVT::i64, 31},
 1333 {Intrinsic::vp_bswap, MVT::i16, 3},
 1334 {Intrinsic::vp_bswap, MVT::i32, 12},
 1335 {Intrinsic::vp_bswap, MVT::i64, 31},
 1336 {Intrinsic::vp_fshl, MVT::i8, 7},
 1337 {Intrinsic::vp_fshl, MVT::i16, 7},
 1338 {Intrinsic::vp_fshl, MVT::i32, 7},
 1339 {Intrinsic::vp_fshl, MVT::i64, 7},
 1340 {Intrinsic::vp_fshr, MVT::i8, 7},
 1341 {Intrinsic::vp_fshr, MVT::i16, 7},
 1342 {Intrinsic::vp_fshr, MVT::i32, 7},
 1343 {Intrinsic::vp_fshr, MVT::i64, 7},
 1344 {Intrinsic::bitreverse, MVT::i8, 17},
 1345 {Intrinsic::bitreverse, MVT::i16, 24},
 1346 {Intrinsic::bitreverse, MVT::i32, 33},
 1347 {Intrinsic::bitreverse, MVT::i64, 52},
 1348 {Intrinsic::vp_bitreverse, MVT::i8, 17},
 1349 {Intrinsic::vp_bitreverse, MVT::i16, 24},
 1350 {Intrinsic::vp_bitreverse, MVT::i32, 33},
 1351 {Intrinsic::vp_bitreverse, MVT::i64, 52},
 1352 {Intrinsic::ctpop, MVT::i8, 12},
 1353 {Intrinsic::ctpop, MVT::i16, 19},
 1354 {Intrinsic::ctpop, MVT::i32, 20},
 1355 {Intrinsic::ctpop, MVT::i64, 21},
 1356 {Intrinsic::ctlz, MVT::i8, 19},
 1357 {Intrinsic::ctlz, MVT::i16, 28},
 1358 {Intrinsic::ctlz, MVT::i32, 31},
 1359 {Intrinsic::ctlz, MVT::i64, 35},
 1360 {Intrinsic::cttz, MVT::i8, 16},
 1361 {Intrinsic::cttz, MVT::i16, 23},
 1362 {Intrinsic::cttz, MVT::i32, 24},
 1363 {Intrinsic::cttz, MVT::i64, 25},
 1364 {Intrinsic::vp_ctpop, MVT::i8, 12},
 1365 {Intrinsic::vp_ctpop, MVT::i16, 19},
 1366 {Intrinsic::vp_ctpop, MVT::i32, 20},
 1367 {Intrinsic::vp_ctpop, MVT::i64, 21},
 1368 {Intrinsic::vp_ctlz, MVT::i8, 19},
 1369 {Intrinsic::vp_ctlz, MVT::i16, 28},
 1370 {Intrinsic::vp_ctlz, MVT::i32, 31},
 1371 {Intrinsic::vp_ctlz, MVT::i64, 35},
 1372 {Intrinsic::vp_cttz, MVT::i8, 16},
 1373 {Intrinsic::vp_cttz, MVT::i16, 23},
 1374 {Intrinsic::vp_cttz, MVT::i32, 24},
 1375 {Intrinsic::vp_cttz, MVT::i64, 25},
 1376};
1377
// Cost model for target-recognized intrinsics. Each case maps the intrinsic
// to the RVV instruction sequence the RISC-V lowering emits and prices it
// via getRISCVInstructionCost, scaled by the type-legalization factor;
// vector intrinsics not handled by a case fall through to the
// VectorIntrinsicCostTable lookup at the bottom.
// NOTE(review): this extract is missing the function signature and a number
// of interior lines (e.g. `Ops`/`ConvOp`/`FsqrtOp` declarations, several
// invalid-cost and fallback returns, and some call-argument lines) — verify
// against the full source before editing.
 1381 auto *RetTy = ICA.getReturnType();
 1382 switch (ICA.getID()) {
 1383 case Intrinsic::lrint:
 1384 case Intrinsic::llrint:
 1385 case Intrinsic::lround:
 1386 case Intrinsic::llround: {
 1387 auto LT = getTypeLegalizationCost(RetTy);
 1388 Type *SrcTy = ICA.getArgTypes().front();
 1389 auto SrcLT = getTypeLegalizationCost(SrcTy);
 1390 if (ST->hasVInstructions() && LT.second.isVector()) {
 1392 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
 1393 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
 1394 if (LT.second.getVectorElementType() == MVT::bf16) {
 1395 if (!ST->hasVInstructionsBF16Minimal())
 1397 if (DstEltSz == 32)
 1398 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
 1399 else
 1400 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
 1401 } else if (LT.second.getVectorElementType() == MVT::f16 &&
 1402 !ST->hasVInstructionsF16()) {
 1403 if (!ST->hasVInstructionsF16Minimal())
 1405 if (DstEltSz == 32)
 1406 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
 1407 else
 1408 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
 1409
 1410 } else if (SrcEltSz > DstEltSz) {
 1411 Ops = {RISCV::VFNCVT_X_F_W};
 1412 } else if (SrcEltSz < DstEltSz) {
 1413 Ops = {RISCV::VFWCVT_X_F_V};
 1414 } else {
 1415 Ops = {RISCV::VFCVT_X_F_V};
 1416 }
 1417
 1418 // We need to use the source LMUL in the case of a narrowing op, and the
 1419 // destination LMUL otherwise.
 1420 if (SrcEltSz > DstEltSz)
 1421 return SrcLT.first *
 1422 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
 1423 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
 1424 }
 1425 break;
 1426 }
 1427 case Intrinsic::ceil:
 1428 case Intrinsic::floor:
 1429 case Intrinsic::trunc:
 1430 case Intrinsic::rint:
 1431 case Intrinsic::round:
 1432 case Intrinsic::roundeven: {
 1433 // These all use the same code.
 1434 auto LT = getTypeLegalizationCost(RetTy);
 1435 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
 1436 return LT.first * 8;
 1437 break;
 1438 }
 1439 case Intrinsic::umin:
 1440 case Intrinsic::umax:
 1441 case Intrinsic::smin:
 1442 case Intrinsic::smax: {
 1443 auto LT = getTypeLegalizationCost(RetTy);
 1444 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
 1445 return LT.first;
 1446
 1447 if (ST->hasVInstructions() && LT.second.isVector()) {
 1448 unsigned Op;
 1449 switch (ICA.getID()) {
 1450 case Intrinsic::umin:
 1451 Op = RISCV::VMINU_VV;
 1452 break;
 1453 case Intrinsic::umax:
 1454 Op = RISCV::VMAXU_VV;
 1455 break;
 1456 case Intrinsic::smin:
 1457 Op = RISCV::VMIN_VV;
 1458 break;
 1459 case Intrinsic::smax:
 1460 Op = RISCV::VMAX_VV;
 1461 break;
 1462 }
 1463 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
 1464 }
 1465 break;
 1466 }
 1467 case Intrinsic::sadd_sat:
 1468 case Intrinsic::ssub_sat:
 1469 case Intrinsic::uadd_sat:
 1470 case Intrinsic::usub_sat: {
 1471 auto LT = getTypeLegalizationCost(RetTy);
 1472 if (ST->hasVInstructions() && LT.second.isVector()) {
 1473 unsigned Op;
 1474 switch (ICA.getID()) {
 1475 case Intrinsic::sadd_sat:
 1476 Op = RISCV::VSADD_VV;
 1477 break;
 1478 case Intrinsic::ssub_sat:
 1479 Op = RISCV::VSSUB_VV;
 1480 break;
 1481 case Intrinsic::uadd_sat:
 1482 Op = RISCV::VSADDU_VV;
 1483 break;
 1484 case Intrinsic::usub_sat:
 1485 Op = RISCV::VSSUBU_VV;
 1486 break;
 1487 }
 1488 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
 1489 }
 1490 break;
 1491 }
 1492 case Intrinsic::fma:
 1493 case Intrinsic::fmuladd: {
 1494 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
 1495 auto LT = getTypeLegalizationCost(RetTy);
 1496 if (ST->hasVInstructions() && LT.second.isVector())
 1497 return LT.first *
 1498 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
 1499 break;
 1500 }
 1501 case Intrinsic::fabs: {
 1502 auto LT = getTypeLegalizationCost(RetTy);
 1503 if (ST->hasVInstructions() && LT.second.isVector()) {
 1504 // lui a0, 8
 1505 // addi a0, a0, -1
 1506 // vsetvli a1, zero, e16, m1, ta, ma
 1507 // vand.vx v8, v8, a0
 1508 // f16 with zvfhmin and bf16 with zvfhbmin
 1509 if (LT.second.getVectorElementType() == MVT::bf16 ||
 1510 (LT.second.getVectorElementType() == MVT::f16 &&
 1511 !ST->hasVInstructionsF16()))
 1512 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
 1513 CostKind) +
 1514 2;
 1515 else
 1516 return LT.first *
 1517 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
 1518 }
 1519 break;
 1520 }
 1521 case Intrinsic::sqrt: {
 1522 auto LT = getTypeLegalizationCost(RetTy);
 1523 if (ST->hasVInstructions() && LT.second.isVector()) {
 1526 MVT ConvType = LT.second;
 1527 MVT FsqrtType = LT.second;
 1528 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
 1529 // will be spilt.
 1530 if (LT.second.getVectorElementType() == MVT::bf16) {
 1531 if (LT.second == MVT::nxv32bf16) {
 1532 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
 1533 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
 1534 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
 1535 ConvType = MVT::nxv16f16;
 1536 FsqrtType = MVT::nxv16f32;
 1537 } else {
 1538 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
 1539 FsqrtOp = {RISCV::VFSQRT_V};
 1540 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
 1541 }
 1542 } else if (LT.second.getVectorElementType() == MVT::f16 &&
 1543 !ST->hasVInstructionsF16()) {
 1544 if (LT.second == MVT::nxv32f16) {
 1545 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
 1546 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
 1547 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
 1548 ConvType = MVT::nxv16f16;
 1549 FsqrtType = MVT::nxv16f32;
 1550 } else {
 1551 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
 1552 FsqrtOp = {RISCV::VFSQRT_V};
 1553 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
 1554 }
 1555 } else {
 1556 FsqrtOp = {RISCV::VFSQRT_V};
 1557 }
 1558
 1559 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
 1560 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
 1561 }
 1562 break;
 1563 }
 1564 case Intrinsic::cttz:
 1565 case Intrinsic::ctlz:
 1566 case Intrinsic::ctpop: {
 1567 auto LT = getTypeLegalizationCost(RetTy);
 1568 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
 1569 unsigned Op;
 1570 switch (ICA.getID()) {
 1571 case Intrinsic::cttz:
 1572 Op = RISCV::VCTZ_V;
 1573 break;
 1574 case Intrinsic::ctlz:
 1575 Op = RISCV::VCLZ_V;
 1576 break;
 1577 case Intrinsic::ctpop:
 1578 Op = RISCV::VCPOP_V;
 1579 break;
 1580 }
 1581 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
 1582 }
 1583 break;
 1584 }
 1585 case Intrinsic::abs: {
 1586 auto LT = getTypeLegalizationCost(RetTy);
 1587 if (ST->hasVInstructions() && LT.second.isVector()) {
 1588 // vabs.v v10, v8
 1589 if (ST->hasStdExtZvabd())
 1590 return LT.first *
 1591 getRISCVInstructionCost({RISCV::VABS_V}, LT.second, CostKind);
 1592
 1593 // vrsub.vi v10, v8, 0
 1594 // vmax.vv v8, v8, v10
 1595 return LT.first *
 1596 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
 1597 LT.second, CostKind);
 1598 }
 1599 break;
 1600 }
 1601 case Intrinsic::fshl:
 1602 case Intrinsic::fshr: {
 1603 if (ICA.getArgs().empty())
 1604 break;
 1605
 1606 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
 1607 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
 1608 // instruction.
 1609 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
 1610 ICA.getArgs()[0] == ICA.getArgs()[1] &&
 1611 (RetTy->getIntegerBitWidth() == 32 ||
 1612 RetTy->getIntegerBitWidth() == 64) &&
 1613 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
 1614 return 1;
 1615 }
 1616 break;
 1617 }
 1618 case Intrinsic::masked_udiv:
 1619 return getArithmeticInstrCost(Instruction::UDiv, ICA.getReturnType(),
 1620 CostKind);
 1621 case Intrinsic::masked_sdiv:
 1622 return getArithmeticInstrCost(Instruction::SDiv, ICA.getReturnType(),
 1623 CostKind);
 1624 case Intrinsic::masked_urem:
 1625 return getArithmeticInstrCost(Instruction::URem, ICA.getReturnType(),
 1626 CostKind);
 1627 case Intrinsic::masked_srem:
 1628 return getArithmeticInstrCost(Instruction::SRem, ICA.getReturnType(),
 1629 CostKind);
 1630 case Intrinsic::get_active_lane_mask: {
 1631 if (ST->hasVInstructions()) {
 1632 Type *ExpRetTy = VectorType::get(
 1633 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
 1634 auto LT = getTypeLegalizationCost(ExpRetTy);
 1635
 1636 // vid.v v8 // considered hoisted
 1637 // vsaddu.vx v8, v8, a0
 1638 // vmsltu.vx v0, v8, a1
 1639 return LT.first *
 1640 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
 1641 LT.second, CostKind);
 1642 }
 1643 break;
 1644 }
 1645 // TODO: add more intrinsic
 1646 case Intrinsic::stepvector: {
 1647 auto LT = getTypeLegalizationCost(RetTy);
 1648 // Legalisation of illegal types involves an `index' instruction plus
 1649 // (LT.first - 1) vector adds.
 1650 if (ST->hasVInstructions())
 1651 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
 1652 (LT.first - 1) *
 1653 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
 1654 return 1 + (LT.first - 1);
 1655 }
 1656 case Intrinsic::vector_splice_left:
 1657 case Intrinsic::vector_splice_right: {
 1658 auto LT = getTypeLegalizationCost(RetTy);
 1659 // Constant offsets fall through to getShuffleCost.
 1660 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(ICA.getArgs()[2]))
 1661 break;
 1662 if (ST->hasVInstructions() && LT.second.isVector()) {
 1663 return LT.first *
 1664 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
 1665 LT.second, CostKind);
 1666 }
 1667 break;
 1668 }
 1669 case Intrinsic::experimental_cttz_elts: {
 1670 Type *ArgTy = ICA.getArgTypes()[0];
 1671 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
 1672 if (getTLI()->shouldExpandCttzElements(ArgType))
 1673 break;
 1674 InstructionCost Cost = getRISCVInstructionCost(
 1675 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
 1676
 1677 // If zero_is_poison is false, then we will generate additional
 1678 // cmp + select instructions to convert -1 to EVL.
 1679 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
 1680 if (ICA.getArgs().size() > 1 &&
 1681 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
 1682 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
 1684 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
 1686
 1687 return Cost;
 1688 }
 1689 case Intrinsic::experimental_vp_splice: {
 1690 // To support type-based query from vectorizer, set the index to 0.
 1691 // Note that index only change the cost from vslide.vx to vslide.vi and in
 1692 // current implementations they have same costs.
 1694 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
 1696 }
 1697 case Intrinsic::fptoui_sat:
 1698 case Intrinsic::fptosi_sat: {
 1700 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
 1701 Type *SrcTy = ICA.getArgTypes()[0];
 1702
 1703 auto SrcLT = getTypeLegalizationCost(SrcTy);
 1704 auto DstLT = getTypeLegalizationCost(RetTy);
 1705 if (!SrcTy->isVectorTy())
 1706 break;
 1707
 1708 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
 1710
 1711 Cost +=
 1712 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
 1713 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
 1714
 1715 // Handle NaN.
 1716 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
 1717 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
 1718 Type *CondTy = RetTy->getWithNewBitWidth(1);
 1719 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
 1721 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
 1723 return Cost;
 1724 }
 1725 case Intrinsic::experimental_vector_extract_last_active: {
 1726 auto *ValTy = cast<VectorType>(ICA.getArgTypes()[0]);
 1727 auto *MaskTy = cast<VectorType>(ICA.getArgTypes()[1]);
 1728
 1729 auto ValLT = getTypeLegalizationCost(ValTy);
 1730 auto MaskLT = getTypeLegalizationCost(MaskTy);
 1731
 1732 // TODO: Return cheaper cost when the entire lane is inactive.
 1733 // The expected asm sequence is:
 1734 // vcpop.m a0, v0
 1735 // beqz a0, exit # Return passthru when the entire lane is inactive.
 1736 // vid v10, v0.t
 1737 // vredmaxu.vs v10, v10, v10
 1738 // vmv.x.s a0, v10
 1739 // zext.b a0, a0
 1740 // vslidedown.vx v8, v8, a0
 1741 // vmv.x.s a0, v8
 1742 // exit:
 1743 // ...
 1744
 1745 // Find a suitable type for a stepvector.
 1746 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
 1747 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
 1748 TLI->getVectorIdxTy(getDataLayout()), MaskTy->getElementCount(),
 1749 /*ZeroIsPoison=*/true, &VScaleRange);
 1750 EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits());
 1751 Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth);
 1752 auto *StepVecTy = VectorType::get(StepTy, ValTy->getElementCount());
 1753 auto StepLT = getTypeLegalizationCost(StepVecTy);
 1754
 1755 // Currently expandVectorFindLastActive cannot handle step vector split.
 1756 // So return invalid when the type needs split.
 1757 // FIXME: Remove this if expandVectorFindLastActive supports split vector.
 1758 if (StepLT.first > 1)
 1760
 1762 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
 1763
 1764 Cost += MaskLT.first *
 1765 getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
 1766 Cost += getCFInstrCost(Instruction::CondBr, CostKind, nullptr);
 1767 Cost += StepLT.first *
 1768 getRISCVInstructionCost(Opcodes, StepLT.second, CostKind);
 1769 Cost += getCastInstrCost(Instruction::ZExt,
 1770 Type::getInt64Ty(ValTy->getContext()), StepTy,
 1772 Cost += ValLT.first *
 1773 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
 1774 ValLT.second, CostKind);
 1775 return Cost;
 1776 }
 1777 }
 1778
 1779 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
 1780 if (auto LT = getTypeLegalizationCost(RetTy);
 1781 LT.second.isVector()) {
 1782 MVT EltTy = LT.second.getVectorElementType();
 1783 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
 1784 ICA.getID(), EltTy))
 1785 return LT.first * Entry->Cost;
 1786 }
 1787 }
 1788
 1790}
1791
// Cost of computing an address: vector (indexed) addresses are charged as a
// vector add, since indexed accesses typically need offset/scale arithmetic;
// scalar addresses defer to the base implementation.
// NOTE(review): the first signature line was dropped from this extract.
 1794 const SCEV *Ptr,
 1796 // Address computations for vector indexed load/store likely require an offset
 1797 // and/or scaling.
 1798 if (ST->hasVInstructions() && PtrTy->isVectorTy())
 1799 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
 1800
 1801 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
 1802}
1803
1805 Type *Src,
1808 const Instruction *I) const {
1809 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1810 if (!IsVectorType)
1811 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1812
1813 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1814 // For now, skip all fixed vector cost analysis when P extension is available
1815 // to avoid crashes in getMinRVVVectorSizeInBits()
1816 if (ST->hasStdExtP() &&
1818 return 1; // Treat as single instruction cost for now
1819 }
1820
1821 // FIXME: Need to compute legalizing cost for illegal types. The current
1822 // code handles only legal types and those which can be trivially
1823 // promoted to legal.
1824 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1825 Dst->getScalarSizeInBits() > ST->getELen())
1826 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1827
1828 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1829 assert(ISD && "Invalid opcode");
1830 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1831 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1832
1833 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1834 // The shared implementation doesn't model vector widening during legalization
1835 // and instead assumes scalarization. In order to scalarize an <N x i1>
1836 // vector, we need to extend/trunc to/from i8. If we don't special case
1837 // this, we can get an infinite recursion cycle.
1838 switch (ISD) {
1839 default:
1840 break;
1841 case ISD::SIGN_EXTEND:
1842 case ISD::ZERO_EXTEND:
1843 if (Src->getScalarSizeInBits() == 1) {
1844 // We do not use vsext/vzext to extend from mask vector.
1845 // Instead we use the following instructions to extend from mask vector:
1846 // vmv.v.i v8, 0
1847 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1848 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1849 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1850 DstLT.second, CostKind) +
1851 DstLT.first - 1;
1852 }
1853 break;
1854 case ISD::TRUNCATE:
1855 if (Dst->getScalarSizeInBits() == 1) {
1856 // We do not use several vncvt to truncate to mask vector. So we could
1857 // not use PowDiff to calculate it.
1858 // Instead we use the following instructions to truncate to mask vector:
1859 // vand.vi v8, v8, 1
1860 // vmsne.vi v0, v8, 0
1861 return SrcLT.first *
1862 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1863 SrcLT.second, CostKind) +
1864 SrcLT.first - 1;
1865 }
1866 break;
1867 };
1868
1869 // Our actual lowering for the case where a wider legal type is available
1870 // uses promotion to the wider type. This is reflected in the result of
1871 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1872 // scalarized if the legalized Src and Dst are not equal sized.
1873 const DataLayout &DL = this->getDataLayout();
1874 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1875 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1876 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1877 SrcLT.second.getSizeInBits()) ||
1878 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1879 DstLT.second.getSizeInBits()) ||
1880 SrcLT.first > 1 || DstLT.first > 1)
1881 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1882
1883 // The split cost is handled by the base getCastInstrCost
1884 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1885
1886 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1887 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1888 switch (ISD) {
1889 case ISD::SIGN_EXTEND:
1890 case ISD::ZERO_EXTEND: {
1891 if ((PowDiff < 1) || (PowDiff > 3))
1892 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1893 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1894 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1895 unsigned Op =
1896 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1897 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1898 }
1899 case ISD::TRUNCATE:
1900 case ISD::FP_EXTEND:
1901 case ISD::FP_ROUND: {
1902 // Counts of narrow/widen instructions.
1903 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1904 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1905
1906 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1907 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1908 : RISCV::VFNCVT_F_F_W;
1910 for (; SrcEltSize != DstEltSize;) {
1911 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1912 ? MVT::getIntegerVT(DstEltSize)
1913 : MVT::getFloatingPointVT(DstEltSize);
1914 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1915 DstEltSize =
1916 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1917 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1918 }
1919 return Cost;
1920 }
1921 case ISD::FP_TO_SINT:
1922 case ISD::FP_TO_UINT: {
1923 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1924 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1925 unsigned FWCVT =
1926 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1927 unsigned FNCVT =
1928 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1929 unsigned SrcEltSize = Src->getScalarSizeInBits();
1930 unsigned DstEltSize = Dst->getScalarSizeInBits();
1932 if ((SrcEltSize == 16) &&
1933 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1934 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1935 // pre-widening to f32 and then convert f32 to integer
1936 VectorType *VecF32Ty =
1937 VectorType::get(Type::getFloatTy(Dst->getContext()),
1938 cast<VectorType>(Dst)->getElementCount());
1939 std::pair<InstructionCost, MVT> VecF32LT =
1940 getTypeLegalizationCost(VecF32Ty);
1941 Cost +=
1942 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1943 VecF32LT.second, CostKind);
1944 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1945 return Cost;
1946 }
1947 if (DstEltSize == SrcEltSize)
1948 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1949 else if (DstEltSize > SrcEltSize)
1950 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1951 else { // (SrcEltSize > DstEltSize)
1952 // First do a narrowing conversion to an integer half the size, then
1953 // truncate if needed.
1954 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1955 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1956 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1957 if ((SrcEltSize / 2) > DstEltSize) {
1958 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1959 Cost +=
1960 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1961 }
1962 }
1963 return Cost;
1964 }
1965 case ISD::SINT_TO_FP:
1966 case ISD::UINT_TO_FP: {
1967 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1968 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1969 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1970 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1971 unsigned SrcEltSize = Src->getScalarSizeInBits();
1972 unsigned DstEltSize = Dst->getScalarSizeInBits();
1973
1975 if ((DstEltSize == 16) &&
1976 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1977 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
1978 // it is converted to f32 and then converted to f16
1979 VectorType *VecF32Ty =
1980 VectorType::get(Type::getFloatTy(Dst->getContext()),
1981 cast<VectorType>(Dst)->getElementCount());
1982 std::pair<InstructionCost, MVT> VecF32LT =
1983 getTypeLegalizationCost(VecF32Ty);
1984 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1985 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1986 DstLT.second, CostKind);
1987 return Cost;
1988 }
1989
1990 if (DstEltSize == SrcEltSize)
1991 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1992 else if (DstEltSize > SrcEltSize) {
1993 if ((DstEltSize / 2) > SrcEltSize) {
1994 VectorType *VecTy =
1995 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1996 cast<VectorType>(Dst)->getElementCount());
1997 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1998 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1999 }
2000 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
2001 } else
2002 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
2003 return Cost;
2004 }
2005 }
2006 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
2007}
2008
2009unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
2010 if (isa<ScalableVectorType>(Ty)) {
2011 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
2012 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
2013 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
2014 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
2015 }
2016 return cast<FixedVectorType>(Ty)->getNumElements();
2017}
2018
2021 FastMathFlags FMF,
2023 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2024 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2025
2026 // Skip if scalar size of Ty is bigger than ELEN.
2027 if (Ty->getScalarSizeInBits() > ST->getELen())
2028 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2029
2030 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2031 if (Ty->getElementType()->isIntegerTy(1)) {
2032 // SelectionDAGBuilder does following transforms:
2033 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2034 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2035 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2036 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
2037 else
2038 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
2039 }
2040
2041 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2043 InstructionCost ExtraCost = 0;
2044 switch (IID) {
2045 case Intrinsic::maximum:
2046 if (FMF.noNaNs()) {
2047 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2048 } else {
2049 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2050 RISCV::VFMV_F_S};
2051 // Cost of Canonical Nan + branch
2052 // lui a0, 523264
2053 // fmv.w.x fa0, a0
2054 Type *DstTy = Ty->getScalarType();
2055 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2056 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2057 ExtraCost = 1 +
2058 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2060 getCFInstrCost(Instruction::CondBr, CostKind);
2061 }
2062 break;
2063
2064 case Intrinsic::minimum:
2065 if (FMF.noNaNs()) {
2066 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2067 } else {
2068 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2069 RISCV::VFMV_F_S};
2070 // Cost of Canonical Nan + branch
2071 // lui a0, 523264
2072 // fmv.w.x fa0, a0
2073 Type *DstTy = Ty->getScalarType();
2074 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
2075 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
2076 ExtraCost = 1 +
2077 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
2079 getCFInstrCost(Instruction::CondBr, CostKind);
2080 }
2081 break;
2082 }
2083 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2084 }
2085
2086 // IR Reduction is composed by one rvv reduction instruction and vmv
2087 unsigned SplitOp;
2089 switch (IID) {
2090 default:
2091 llvm_unreachable("Unsupported intrinsic");
2092 case Intrinsic::smax:
2093 SplitOp = RISCV::VMAX_VV;
2094 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2095 break;
2096 case Intrinsic::smin:
2097 SplitOp = RISCV::VMIN_VV;
2098 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2099 break;
2100 case Intrinsic::umax:
2101 SplitOp = RISCV::VMAXU_VV;
2102 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2103 break;
2104 case Intrinsic::umin:
2105 SplitOp = RISCV::VMINU_VV;
2106 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2107 break;
2108 case Intrinsic::maxnum:
2109 SplitOp = RISCV::VFMAX_VV;
2110 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2111 break;
2112 case Intrinsic::minnum:
2113 SplitOp = RISCV::VFMIN_VV;
2114 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2115 break;
2116 }
2117 // Add a cost for data larger than LMUL8
2118 InstructionCost SplitCost =
2119 (LT.first > 1) ? (LT.first - 1) *
2120 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2121 : 0;
2122 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2123}
2124
2127 std::optional<FastMathFlags> FMF,
2129 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2130 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2131
2132 // Skip if scalar size of Ty is bigger than ELEN.
2133 if (Ty->getScalarSizeInBits() > ST->getELen())
2134 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2135
2136 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2137 assert(ISD && "Invalid opcode");
2138
2139 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2140 ISD != ISD::FADD)
2141 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2142
2143 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2144 Type *ElementTy = Ty->getElementType();
2145 if (ElementTy->isIntegerTy(1)) {
2146 // Example sequences:
2147 // vfirst.m a0, v0
2148 // seqz a0, a0
2149 if (LT.second == MVT::v1i1)
2150 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
2151 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2153
2154 if (ISD == ISD::AND) {
2155 // Example sequences:
2156 // vmand.mm v8, v9, v8 ; needed every time type is split
2157 // vmnot.m v8, v0 ; alias for vmnand
2158 // vcpop.m a0, v8
2159 // seqz a0, a0
2160
2161 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2162 // For LMUL <= 8, there is no splitting,
2163 // the sequences are vmnot, vcpop and seqz.
2164 // When LMUL > 8 and split = 1,
2165 // the sequences are vmnand, vcpop and seqz.
2166 // When LMUL > 8 and split > 1,
2167 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2168 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2169 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
2170 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
2171 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2172 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2174 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2175 // Example sequences:
2176 // vsetvli a0, zero, e8, mf8, ta, ma
2177 // vmxor.mm v8, v0, v8 ; needed every time type is split
2178 // vcpop.m a0, v8
2179 // andi a0, a0, 1
2180 return (LT.first - 1) *
2181 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
2182 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
2183 } else {
2184 assert(ISD == ISD::OR);
2185 // Example sequences:
2186 // vsetvli a0, zero, e8, mf8, ta, ma
2187 // vmor.mm v8, v9, v8 ; needed every time type is split
2188 // vcpop.m a0, v0
2189 // snez a0, a0
2190 return (LT.first - 1) *
2191 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
2192 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
2193 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2195 }
2196 }
2197
2198 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2199 // instruction, and others is composed by two vmv and one rvv reduction
2200 // instruction
2201 unsigned SplitOp;
2203 switch (ISD) {
2204 case ISD::ADD:
2205 SplitOp = RISCV::VADD_VV;
2206 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2207 break;
2208 case ISD::OR:
2209 SplitOp = RISCV::VOR_VV;
2210 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2211 break;
2212 case ISD::XOR:
2213 SplitOp = RISCV::VXOR_VV;
2214 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2215 break;
2216 case ISD::AND:
2217 SplitOp = RISCV::VAND_VV;
2218 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2219 break;
2220 case ISD::FADD:
2221 // We can't promote f16/bf16 fadd reductions.
2222 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2223 LT.second.getScalarType() == MVT::bf16)
2224 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2226 Opcodes.push_back(RISCV::VFMV_S_F);
2227 for (unsigned i = 0; i < LT.first.getValue(); i++)
2228 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2229 Opcodes.push_back(RISCV::VFMV_F_S);
2230 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2231 }
2232 SplitOp = RISCV::VFADD_VV;
2233 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2234 break;
2235 }
2236 // Add a cost for data larger than LMUL8
2237 InstructionCost SplitCost =
2238 (LT.first > 1) ? (LT.first - 1) *
2239 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2240 : 0;
2241 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2242}
2243
2245 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2246 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2247 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2248 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2249 FMF, CostKind);
2250
2251 // Skip if scalar size of ResTy is bigger than ELEN.
2252 if (ResTy->getScalarSizeInBits() > ST->getELen())
2253 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2254 FMF, CostKind);
2255
2256 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2257 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2258 FMF, CostKind);
2259
2260 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2261
2262 if (IsUnsigned && Opcode == Instruction::Add &&
2263 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2264 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2265 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2266 return LT.first *
2267 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2268 }
2269
2270 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2271 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2272 FMF, CostKind);
2273
2274 return (LT.first - 1) +
2275 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2276}
2277
2281 assert(OpInfo.isConstant() && "non constant operand?");
2282 if (!isa<VectorType>(Ty))
2283 // FIXME: We need to account for immediate materialization here, but doing
2284 // a decent job requires more knowledge about the immediate than we
2285 // currently have here.
2286 return 0;
2287
2288 if (OpInfo.isUniform())
2289 // vmv.v.i, vmv.v.x, or vfmv.v.f
2290 // We ignore the cost of the scalar constant materialization to be consistent
2291 // with how we treat scalar constants themselves just above.
2292 return 1;
2293
2294 return getConstantPoolLoadCost(Ty, CostKind);
2295}
2296
2298 Align Alignment,
2299 unsigned AddressSpace,
2301 TTI::OperandValueInfo OpInfo,
2302 const Instruction *I) const {
2303 EVT VT = TLI->getValueType(DL, Src, true);
2304 // Type legalization can't handle structs
2305 if (VT == MVT::Other)
2306 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2307 CostKind, OpInfo, I);
2308
2310 if (Opcode == Instruction::Store && OpInfo.isConstant())
2311 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2312
2313 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2314
2315 InstructionCost BaseCost = [&]() {
2316 InstructionCost Cost = LT.first;
2318 return Cost;
2319
2320 // Our actual lowering for the case where a wider legal type is available
2321 // uses the a VL predicated load on the wider type. This is reflected in
2322 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2323 // widened cases are scalarized.
2324 const DataLayout &DL = this->getDataLayout();
2325 if (Src->isVectorTy() && LT.second.isVector() &&
2326 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2327 LT.second.getSizeInBits()))
2328 return Cost;
2329
2330 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2331 CostKind, OpInfo, I);
2332 }();
2333
2334 // Assume memory ops cost scale with the number of vector registers
2335 // possible accessed by the instruction. Note that BasicTTI already
2336 // handles the LT.first term for us.
2337 if (ST->hasVInstructions() && LT.second.isVector() &&
2339 BaseCost *= TLI->getLMULCost(LT.second);
2340 return Cost + BaseCost;
2341}
2342
2344 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2346 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2348 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2349 Op1Info, Op2Info, I);
2350
2351 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2352 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2353 Op1Info, Op2Info, I);
2354
2355 // Skip if scalar size of ValTy is bigger than ELEN.
2356 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2357 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2358 Op1Info, Op2Info, I);
2359
2360 auto GetConstantMatCost =
2361 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2362 if (OpInfo.isUniform())
2363 // We return 0 we currently ignore the cost of materializing scalar
2364 // constants in GPRs.
2365 return 0;
2366
2367 return getConstantPoolLoadCost(ValTy, CostKind);
2368 };
2369
2370 InstructionCost ConstantMatCost;
2371 if (Op1Info.isConstant())
2372 ConstantMatCost += GetConstantMatCost(Op1Info);
2373 if (Op2Info.isConstant())
2374 ConstantMatCost += GetConstantMatCost(Op2Info);
2375
2376 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2377 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2378 if (CondTy->isVectorTy()) {
2379 if (ValTy->getScalarSizeInBits() == 1) {
2380 // vmandn.mm v8, v8, v9
2381 // vmand.mm v9, v0, v9
2382 // vmor.mm v0, v9, v8
2383 return ConstantMatCost +
2384 LT.first *
2385 getRISCVInstructionCost(
2386 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2387 LT.second, CostKind);
2388 }
2389 // vselect and max/min are supported natively.
2390 return ConstantMatCost +
2391 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2392 CostKind);
2393 }
2394
2395 if (ValTy->getScalarSizeInBits() == 1) {
2396 // vmv.v.x v9, a0
2397 // vmsne.vi v9, v9, 0
2398 // vmandn.mm v8, v8, v9
2399 // vmand.mm v9, v0, v9
2400 // vmor.mm v0, v9, v8
2401 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2402 return ConstantMatCost +
2403 LT.first *
2404 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2405 InterimVT, CostKind) +
2406 LT.first * getRISCVInstructionCost(
2407 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2408 LT.second, CostKind);
2409 }
2410
2411 // vmv.v.x v10, a0
2412 // vmsne.vi v0, v10, 0
2413 // vmerge.vvm v8, v9, v8, v0
2414 return ConstantMatCost +
2415 LT.first * getRISCVInstructionCost(
2416 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2417 LT.second, CostKind);
2418 }
2419
2420 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2421 CmpInst::isIntPredicate(VecPred)) {
2422 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2423 // provided they incur the same cost across all implementations
2424 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2425 LT.second,
2426 CostKind);
2427 }
2428
2429 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2430 CmpInst::isFPPredicate(VecPred)) {
2431
2432 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2433 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2434 return ConstantMatCost +
2435 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2436
2437 // If we do not support the input floating point vector type, use the base
2438 // one which will calculate as:
2439 // ScalarizeCost + Num * Cost for fixed vector,
2440 // InvalidCost for scalable vector.
2441 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2442 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2443 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2444 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2445 Op1Info, Op2Info, I);
2446
2447 // Assuming vector fp compare and mask instructions are all the same cost
2448 // until a need arises to differentiate them.
2449 switch (VecPred) {
2450 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2451 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2452 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2453 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2454 return ConstantMatCost +
2455 LT.first * getRISCVInstructionCost(
2456 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2457 LT.second, CostKind);
2458
2459 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2460 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2461 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2462 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2463 return ConstantMatCost +
2464 LT.first *
2465 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2466 LT.second, CostKind);
2467
2468 case CmpInst::FCMP_OEQ: // vmfeq.vv
2469 case CmpInst::FCMP_OGT: // vmflt.vv
2470 case CmpInst::FCMP_OGE: // vmfle.vv
2471 case CmpInst::FCMP_OLT: // vmflt.vv
2472 case CmpInst::FCMP_OLE: // vmfle.vv
2473 case CmpInst::FCMP_UNE: // vmfne.vv
2474 return ConstantMatCost +
2475 LT.first *
2476 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2477 default:
2478 break;
2479 }
2480 }
2481
2482 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2483 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2484 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2485 // be (0 + select instr cost).
2486 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2487 ValTy->isIntegerTy() && !I->user_empty()) {
2488 if (all_of(I->users(), [&](const User *U) {
2489 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2490 U->getType()->isIntegerTy() &&
2491 !isa<ConstantData>(U->getOperand(1)) &&
2492 !isa<ConstantData>(U->getOperand(2));
2493 }))
2494 return 0;
2495 }
2496
2497 // TODO: Add cost for scalar type.
2498
2499 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2500 Op1Info, Op2Info, I);
2501}
2502
2505 const Instruction *I) const {
2507 return Opcode == Instruction::PHI ? 0 : 1;
2508 // Branches are assumed to be predicted.
2509 return 0;
2510}
2511
2513 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2514 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2515 assert(Val->isVectorTy() && "This must be a vector type");
2516
2517 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2518 // For now, skip all fixed vector cost analysis when P extension is available
2519 // to avoid crashes in getMinRVVVectorSizeInBits()
2520 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2521 return 1; // Treat as single instruction cost for now
2522 }
2523
2524 if (Opcode != Instruction::ExtractElement &&
2525 Opcode != Instruction::InsertElement)
2526 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2527 VIC);
2528
2529 // Legalize the type.
2530 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2531
2532 // This type is legalized to a scalar type.
2533 if (!LT.second.isVector()) {
2534 auto *FixedVecTy = cast<FixedVectorType>(Val);
2535 // If Index is a known constant, cost is zero.
2536 if (Index != -1U)
2537 return 0;
2538 // Extract/InsertElement with non-constant index is very costly when
2539 // scalarized; estimate cost of loads/stores sequence via the stack:
2540 // ExtractElement cost: store vector to stack, load scalar;
2541 // InsertElement cost: store vector to stack, store scalar, load vector.
2542 Type *ElemTy = FixedVecTy->getElementType();
2543 auto NumElems = FixedVecTy->getNumElements();
2544 auto Align = DL.getPrefTypeAlign(ElemTy);
2545 InstructionCost LoadCost =
2546 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2547 InstructionCost StoreCost =
2548 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2549 return Opcode == Instruction::ExtractElement
2550 ? StoreCost * NumElems + LoadCost
2551 : (StoreCost + LoadCost) * NumElems + StoreCost;
2552 }
2553
2554 // For unsupported scalable vector.
2555 if (LT.second.isScalableVector() && !LT.first.isValid())
2556 return LT.first;
2557
2558 // Mask vector extract/insert is expanded via e8.
2559 if (Val->getScalarSizeInBits() == 1) {
2560 VectorType *WideTy =
2562 cast<VectorType>(Val)->getElementCount());
2563 if (Opcode == Instruction::ExtractElement) {
2564 InstructionCost ExtendCost
2565 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2567 InstructionCost ExtractCost
2568 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2569 return ExtendCost + ExtractCost;
2570 }
2571 InstructionCost ExtendCost
2572 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2574 InstructionCost InsertCost
2575 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2576 InstructionCost TruncCost
2577 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2579 return ExtendCost + InsertCost + TruncCost;
2580 }
2581
2582
2583 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2584 // and vslideup + vmv.s.x to insert element to vector.
2585 unsigned BaseCost = 1;
2586 // When insertelement we should add the index with 1 as the input of vslideup.
2587 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2588
2589 if (Index != -1U) {
2590 // The type may be split. For fixed-width vectors we can normalize the
2591 // index to the new type.
2592 if (LT.second.isFixedLengthVector()) {
2593 unsigned Width = LT.second.getVectorNumElements();
2594 Index = Index % Width;
2595 }
2596
2597 // If exact VLEN is known, we will insert/extract into the appropriate
2598 // subvector with no additional subvector insert/extract cost.
2599 if (auto VLEN = ST->getRealVLen()) {
2600 unsigned EltSize = LT.second.getScalarSizeInBits();
2601 unsigned M1Max = *VLEN / EltSize;
2602 Index = Index % M1Max;
2603 }
2604
2605 if (Index == 0)
2606 // We can extract/insert the first element without vslidedown/vslideup.
2607 SlideCost = 0;
2608 else if (Opcode == Instruction::InsertElement)
2609 SlideCost = 1; // With a constant index, we do not need to use addi.
2610 }
2611
2612 // When the vector needs to split into multiple register groups and the index
2613 // exceeds single vector register group, we need to insert/extract the element
2614 // via stack.
2615 if (LT.first > 1 &&
2616 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2617 LT.second.isScalableVector()))) {
2618 Type *ScalarType = Val->getScalarType();
2619 Align VecAlign = DL.getPrefTypeAlign(Val);
2620 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2621 // Extra addi for unknown index.
2622 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2623
2624 // Store all split vectors into stack and load the target element.
2625 if (Opcode == Instruction::ExtractElement)
2626 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2627 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2628 CostKind) +
2629 IdxCost;
2630
2631 // Store all split vectors into stack and store the target element and load
2632 // vectors back.
2633 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2634 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2635 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2636 CostKind) +
2637 IdxCost;
2638 }
2639
2640 // Extract i64 in the target that has XLEN=32 need more instruction.
2641 if (Val->getScalarType()->isIntegerTy() &&
2642 ST->getXLen() < Val->getScalarSizeInBits()) {
2643 // For extractelement, we need the following instructions:
2644 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2645 // vslidedown.vx v8, v8, a0
2646 // vmv.x.s a0, v8
2647 // li a1, 32
2648 // vsrl.vx v8, v8, a1
2649 // vmv.x.s a1, v8
2650
2651 // For insertelement, we need the following instructions:
2652 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2653 // vmv.v.i v12, 0
2654 // vslide1up.vx v16, v12, a1
2655 // vslide1up.vx v12, v16, a0
2656 // addi a0, a2, 1
2657 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2658 // vslideup.vx v8, v12, a2
2659
2660 // TODO: should we count these special vsetvlis?
2661 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2662 }
2663 return BaseCost + SlideCost;
2664}
2665
2669 unsigned Index) const {
2670 if (isa<FixedVectorType>(Val))
2672 Index);
2673
2674 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2675 // for the cost of extracting the last lane of a scalable vector. It probably
2676 // needs a more accurate cost.
2677 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2678 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2679 return getVectorInstrCost(Opcode, Val, CostKind,
2680 EC.getKnownMinValue() - 1 - Index, nullptr,
2681 nullptr);
2682}
2683
2684/// Check to see if this instruction is expected to be combined to a simpler
2685/// operation during/before lowering. If so return the cost of the combined
2686/// operation rather than provided one. For instance, `udiv i16 %X, 2` is likely
2687/// to be combined to `lshr i16 %X, 1`, so return the cost of a `lshr` rather
2688/// than the cost of a `udiv`
2689std::optional<InstructionCost>
// NOTE(review): the doc extraction dropped two source lines here — the line
// carrying the qualified function name (listing line 2690) and the line
// declaring the Opd1Info/Opd2Info parameters (listing line 2692; they are
// TTI::OperandValueInfo operand descriptors, used below). Restore both from
// the upstream LLVM file before building.
2691 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2693 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
// Returns std::nullopt when no simplification applies; the caller then
// falls through to the normal arithmetic cost computation.
2694 // Vector unsigned division/remainder will be simplified to shifts/masks.
2695 if ((Opcode == Instruction::UDiv || Opcode == Instruction::URem) &&
2696 Opd2Info.isConstant() && Opd2Info.isPowerOf2()) {
2697 if (Opcode == Instruction::UDiv)
// udiv by 2^k lowers to a logical shift right; cost it as LShr.
2698 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Opd1Info,
2699 Opd2Info.getNoProps());
2700 // UREM
// urem by 2^k lowers to a bitmask; cost it as And.
2701 return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Opd1Info,
2702 Opd2Info.getNoProps());
2703 }
2704 return std::nullopt;
2705}
2706
// NOTE(review): extraction dropped the "InstructionCost
// RISCVTTIImpl::getArithmeticInstrCost(" line and the
// "TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info," parameter
// line of this signature; confirm against upstream.
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // TODO: Handle more cost kinds.
  // NOTE(review): the guard line (upstream: "if (CostKind !=
  // TTI::TCK_RecipThroughput)") was dropped by extraction ahead of this
  // early fallback to the base implementation.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Fixed-length vectors without RVV codegen enabled fall back to the
  // (scalarizing) base-class cost model.
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // If this op is expected to fold to a simpler one (e.g. udiv-by-pow2 ->
  // lshr), cost the simpler op instead.
  if (std::optional<InstructionCost> CombinedCost =
  // NOTE(review): the callee-name line of this call (the combined-opcode
  // cost helper defined above) was dropped by extraction.
          Op2Info, Args, CxtI))
    return *CombinedCost;

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);

  // TODO: Handle scalar type.
  if (!LT.second.isVector()) {
    // Scalar integer div/rem: use a flat "expensive" cost when the target
    // has a native instruction for it.
    static const CostTblEntry DivTbl[]{
        {ISD::UDIV, MVT::i32, TTI::TCC_Expensive},
        {ISD::UDIV, MVT::i64, TTI::TCC_Expensive},
        {ISD::SDIV, MVT::i32, TTI::TCC_Expensive},
        {ISD::SDIV, MVT::i64, TTI::TCC_Expensive},
        {ISD::UREM, MVT::i32, TTI::TCC_Expensive},
        {ISD::UREM, MVT::i64, TTI::TCC_Expensive},
        {ISD::SREM, MVT::i32, TTI::TCC_Expensive},
        {ISD::SREM, MVT::i64, TTI::TCC_Expensive}};
    if (TLI->isOperationLegalOrPromote(ISDOpcode, LT.second))
      if (const auto *Entry = CostTableLookup(DivTbl, ISDOpcode, LT.second))
        return Entry->Cost * LT.first;

    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }

  // f16 with zvfhmin and bf16 will be promoted to f32.
  // FIXME: nxv32[b]f16 will be custom lowered and split.
  InstructionCost CastCost = 0;
  if ((LT.second.getVectorElementType() == MVT::f16 ||
       LT.second.getVectorElementType() == MVT::bf16) &&
      TLI->getOperationAction(ISDOpcode, LT.second) ==
  // NOTE(review): the "...Promote) {" line completing this condition was
  // dropped by extraction (the LegalizeAction::Promote comparison).
    MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
    Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
    Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    // Add cost of extending arguments
    CastCost += LT.first * Args.size() *
                getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
    // NOTE(review): the trailing argument line of this call (cast context
    // hint and CostKind) was dropped by extraction.
    // Add cost of truncating result
    CastCost +=
        LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
    // NOTE(review): same dropped trailing-argument line as above.
    // Compute cost of op in promoted type
    LT.second = PromotedVT;
  }

  // Model the cost of materializing a constant operand: free if it can be
  // splatted (or held in a GPR), otherwise a constant-pool load.
  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in scalar register
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  // Map the IR opcode to a representative RVV instruction whose per-LMUL
  // cost model is used below.
  unsigned Op;
  switch (ISDOpcode) {
  case ISD::ADD:
  case ISD::SUB:
    Op = RISCV::VADD_VV;
    break;
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    Op = RISCV::VSLL_VV;
    break;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    // i1 vectors use the mask-register logical ops instead.
    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
    break;
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
    Op = RISCV::VMUL_VV;
    break;
  case ISD::SDIV:
  case ISD::UDIV:
    Op = RISCV::VDIV_VV;
    break;
  case ISD::SREM:
  case ISD::UREM:
    Op = RISCV::VREM_VV;
    break;
  case ISD::FADD:
  case ISD::FSUB:
    Op = RISCV::VFADD_VV;
    break;
  case ISD::FMUL:
    Op = RISCV::VFMUL_VV;
    break;
  case ISD::FDIV:
    Op = RISCV::VFDIV_VV;
    break;
  case ISD::FNEG:
    Op = RISCV::VFSGNJN_VV;
    break;
  default:
    // Assuming all other instructions have the same cost until a need arises to
    // differentiate them.
    return CastCost + ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }

  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
  // ops are twice as expensive as integer ops. Do the same for vectors so
  // scalar floating point ops aren't cheaper than their vector equivalents.
  if (Ty->isFPOrFPVectorTy())
    InstrCost *= 2;
  return CastCost + ConstantMatCost + LT.first * InstrCost;
}
2854
// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
// NOTE(review): extraction dropped the "InstructionCost
// RISCVTTIImpl::getPointersChainCost(" line ahead of these parameters, and
// the "TTI::TargetCostKind CostKind) const {" line plus the initialization
// of the local accumulator "Cost" after them; confirm against upstream.
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
  // In the basic model we take into account GEP instructions only
  // (although here can come alloca instruction, a value, constants and/or
  // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
  // pointer). Typically, if Base is a not a GEP-instruction and all the
  // pointers are relative to the same base address, all the rest are
  // either GEP instructions, PHIs, bitcasts or constants. When we have same
  // base, we just calculate cost of each non-Base GEP as an ADD operation if
  // any their index is a non-const.
  // If no known dependencies between the pointers cost is calculated as a sum
  // of costs of GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      // All-constant indices fold into the addressing mode: free.
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      // Otherwise the GEP costs one ADD relative to the shared base.
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None}, {});
    } else {
      // Unrelated base: charge the full cost of the GEP itself.
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}
2902
// NOTE(review): extraction dropped the leading signature lines (upstream:
// "void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// TTI::UnrollingPreferences &UP,"); confirm against upstream.
                                           OptimizationRemarkEmitter *ORE) const {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  // would apply to all settings below to enable performance.


  // Subtargets that prefer the generic heuristics use the base class.
  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable Upper bound unrolling universally, not dependent upon the conditions
  // below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  // NOTE(review): a line was dropped by extraction here (upstream also zeroes
  // UP.PartialOptSizeThreshold).
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow another exit other than the latch. This acts as an early exit
  // as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining. Don't unroll auto-vectorized loops either, though do allow
  // unrolling of the scalar remainder.
  bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
  // NOTE(review): the declaration/initialization of the local "Cost"
  // accumulator was dropped by extraction here.
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Both auto-vectorized loops and the scalar remainder have the
      // isvectorized attribute, so differentiate between them by the presence
      // of vector instructions.
      if (IsVectorized && (I.getType()->isVectorTy() ||
                           llvm::any_of(I.operand_values(), [](Value *V) {
                             return V->getType()->isVectorTy();
                           })))
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        // Intrinsic-like calls that don't become real calls are fine.
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
      // NOTE(review): the trailing cost-kind argument line of this call was
      // dropped by extraction (upstream uses TTI::TCK_SizeAndLatency).
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;

  // Force unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}
2981
2986
// NOTE(review): extraction dropped the first signature line (upstream:
// "bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,"); confirm
// against upstream. Populates Info.InterestingOperands for RVV load/store
// intrinsics (unit-stride, strided, and indexed, masked or not) and returns
// true when the intrinsic was recognized.
                                       MemIntrinsicInfo &Info) const {
  const DataLayout &DL = getDataLayout();
  Intrinsic::ID IID = Inst->getIntrinsicID();
  LLVMContext &C = Inst->getContext();
  bool HasMask = false;

  // Segment intrinsics operate on a TargetExtType carrying the field count
  // (NF) as an integer parameter; non-segment ops behave as NF == 1.
  auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
                      bool IsWrite) -> int64_t {
    if (auto *TarExtTy =
            dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
      return TarExtTy->getIntParameter(0);

    return 1;
  };

  switch (IID) {
  case Intrinsic::riscv_vle_mask:
  case Intrinsic::riscv_vse_mask:
  case Intrinsic::riscv_vlseg2_mask:
  case Intrinsic::riscv_vlseg3_mask:
  case Intrinsic::riscv_vlseg4_mask:
  case Intrinsic::riscv_vlseg5_mask:
  case Intrinsic::riscv_vlseg6_mask:
  case Intrinsic::riscv_vlseg7_mask:
  case Intrinsic::riscv_vlseg8_mask:
  case Intrinsic::riscv_vsseg2_mask:
  case Intrinsic::riscv_vsseg3_mask:
  case Intrinsic::riscv_vsseg4_mask:
  case Intrinsic::riscv_vsseg5_mask:
  case Intrinsic::riscv_vsseg6_mask:
  case Intrinsic::riscv_vsseg7_mask:
  case Intrinsic::riscv_vsseg8_mask:
    HasMask = true;
    [[fallthrough]];
  case Intrinsic::riscv_vle:
  case Intrinsic::riscv_vse:
  case Intrinsic::riscv_vlseg2:
  case Intrinsic::riscv_vlseg3:
  case Intrinsic::riscv_vlseg4:
  case Intrinsic::riscv_vlseg5:
  case Intrinsic::riscv_vlseg6:
  case Intrinsic::riscv_vlseg7:
  case Intrinsic::riscv_vlseg8:
  case Intrinsic::riscv_vsseg2:
  case Intrinsic::riscv_vsseg3:
  case Intrinsic::riscv_vsseg4:
  case Intrinsic::riscv_vsseg5:
  case Intrinsic::riscv_vsseg6:
  case Intrinsic::riscv_vsseg7:
  case Intrinsic::riscv_vsseg8: {
    // Intrinsic interface:
    // riscv_vle(merge, ptr, vl)
    // riscv_vle_mask(merge, ptr, mask, vl, policy)
    // riscv_vse(val, ptr, vl)
    // riscv_vse_mask(val, ptr, mask, vl, policy)
    // riscv_vlseg#(merge, ptr, vl, sew)
    // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
    // riscv_vsseg#(val, ptr, vl, sew)
    // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
    // Stores return void; loads return the loaded value.
    bool IsWrite = Inst->getType()->isVoidTy();
    Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
    // The results of segment loads are TargetExtType.
    if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
      unsigned SEW =
          1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
                   ->getZExtValue();
      Ty = TarExtTy->getTypeParameter(0U);
      // NOTE(review): extraction dropped the "Ty = ScalableVectorType::get("
      // line here; the next two lines are that call's arguments.
          IntegerType::get(C, SEW),
          cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
    }
    const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
    unsigned VLIndex = RVVIInfo->VLOperand;
    // Pointer immediately precedes (mask and) VL in the argument list.
    unsigned PtrOperandNo = VLIndex - 1 - HasMask;
    MaybeAlign Alignment =
        Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
    // Unmasked ops are modeled with an all-true mask.
    Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
    Value *Mask = ConstantInt::getTrue(MaskType);
    if (HasMask)
      Mask = Inst->getArgOperand(VLIndex - 1);
    Value *EVL = Inst->getArgOperand(VLIndex);
    unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
    // RVV uses contiguous elements as a segment.
    if (SegNum > 1) {
      unsigned ElemSize = Ty->getScalarSizeInBits();
      auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
      Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
    }
    Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
                                          Alignment, Mask, EVL);
    return true;
  }
  case Intrinsic::riscv_vlse_mask:
  case Intrinsic::riscv_vsse_mask:
  case Intrinsic::riscv_vlsseg2_mask:
  case Intrinsic::riscv_vlsseg3_mask:
  case Intrinsic::riscv_vlsseg4_mask:
  case Intrinsic::riscv_vlsseg5_mask:
  case Intrinsic::riscv_vlsseg6_mask:
  case Intrinsic::riscv_vlsseg7_mask:
  case Intrinsic::riscv_vlsseg8_mask:
  case Intrinsic::riscv_vssseg2_mask:
  case Intrinsic::riscv_vssseg3_mask:
  case Intrinsic::riscv_vssseg4_mask:
  case Intrinsic::riscv_vssseg5_mask:
  case Intrinsic::riscv_vssseg6_mask:
  case Intrinsic::riscv_vssseg7_mask:
  case Intrinsic::riscv_vssseg8_mask:
    HasMask = true;
    [[fallthrough]];
  case Intrinsic::riscv_vlse:
  case Intrinsic::riscv_vsse:
  case Intrinsic::riscv_vlsseg2:
  case Intrinsic::riscv_vlsseg3:
  case Intrinsic::riscv_vlsseg4:
  case Intrinsic::riscv_vlsseg5:
  case Intrinsic::riscv_vlsseg6:
  case Intrinsic::riscv_vlsseg7:
  case Intrinsic::riscv_vlsseg8:
  case Intrinsic::riscv_vssseg2:
  case Intrinsic::riscv_vssseg3:
  case Intrinsic::riscv_vssseg4:
  case Intrinsic::riscv_vssseg5:
  case Intrinsic::riscv_vssseg6:
  case Intrinsic::riscv_vssseg7:
  case Intrinsic::riscv_vssseg8: {
    // Intrinsic interface:
    // riscv_vlse(merge, ptr, stride, vl)
    // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
    // riscv_vsse(val, ptr, stride, vl)
    // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
    // riscv_vlsseg#(merge, ptr, offset, vl, sew)
    // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
    // riscv_vssseg#(val, ptr, offset, vl, sew)
    // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
    bool IsWrite = Inst->getType()->isVoidTy();
    Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
    // The results of segment loads are TargetExtType.
    if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
      unsigned SEW =
          1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
                   ->getZExtValue();
      Ty = TarExtTy->getTypeParameter(0U);
      // NOTE(review): extraction dropped the "Ty = ScalableVectorType::get("
      // line here; the next two lines are that call's arguments.
          IntegerType::get(C, SEW),
          cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
    }
    const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
    unsigned VLIndex = RVVIInfo->VLOperand;
    // Strided forms carry an extra stride operand between ptr and (mask,) vl.
    unsigned PtrOperandNo = VLIndex - 2 - HasMask;
    MaybeAlign Alignment =
        Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);

    Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
    // Use the pointer alignment as the element alignment if the stride is a
    // multiple of the pointer alignment. Otherwise, the element alignment
    // should be the greatest common divisor of pointer alignment and stride.
    // For simplicity, just consider unalignment for elements.
    unsigned PointerAlign = Alignment.valueOrOne().value();
    if (!isa<ConstantInt>(Stride) ||
        cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
      Alignment = Align(1);

    Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
    Value *Mask = ConstantInt::getTrue(MaskType);
    if (HasMask)
      Mask = Inst->getArgOperand(VLIndex - 1);
    Value *EVL = Inst->getArgOperand(VLIndex);
    unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
    // RVV uses contiguous elements as a segment.
    if (SegNum > 1) {
      unsigned ElemSize = Ty->getScalarSizeInBits();
      auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
      Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
    }
    Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
                                          Alignment, Mask, EVL, Stride);
    return true;
  }
  case Intrinsic::riscv_vloxei_mask:
  case Intrinsic::riscv_vluxei_mask:
  case Intrinsic::riscv_vsoxei_mask:
  case Intrinsic::riscv_vsuxei_mask:
  case Intrinsic::riscv_vloxseg2_mask:
  case Intrinsic::riscv_vloxseg3_mask:
  case Intrinsic::riscv_vloxseg4_mask:
  case Intrinsic::riscv_vloxseg5_mask:
  case Intrinsic::riscv_vloxseg6_mask:
  case Intrinsic::riscv_vloxseg7_mask:
  case Intrinsic::riscv_vloxseg8_mask:
  case Intrinsic::riscv_vluxseg2_mask:
  case Intrinsic::riscv_vluxseg3_mask:
  case Intrinsic::riscv_vluxseg4_mask:
  case Intrinsic::riscv_vluxseg5_mask:
  case Intrinsic::riscv_vluxseg6_mask:
  case Intrinsic::riscv_vluxseg7_mask:
  case Intrinsic::riscv_vluxseg8_mask:
  case Intrinsic::riscv_vsoxseg2_mask:
  case Intrinsic::riscv_vsoxseg3_mask:
  case Intrinsic::riscv_vsoxseg4_mask:
  case Intrinsic::riscv_vsoxseg5_mask:
  case Intrinsic::riscv_vsoxseg6_mask:
  case Intrinsic::riscv_vsoxseg7_mask:
  case Intrinsic::riscv_vsoxseg8_mask:
  case Intrinsic::riscv_vsuxseg2_mask:
  case Intrinsic::riscv_vsuxseg3_mask:
  case Intrinsic::riscv_vsuxseg4_mask:
  case Intrinsic::riscv_vsuxseg5_mask:
  case Intrinsic::riscv_vsuxseg6_mask:
  case Intrinsic::riscv_vsuxseg7_mask:
  case Intrinsic::riscv_vsuxseg8_mask:
    HasMask = true;
    [[fallthrough]];
  case Intrinsic::riscv_vloxei:
  case Intrinsic::riscv_vluxei:
  case Intrinsic::riscv_vsoxei:
  case Intrinsic::riscv_vsuxei:
  case Intrinsic::riscv_vloxseg2:
  case Intrinsic::riscv_vloxseg3:
  case Intrinsic::riscv_vloxseg4:
  case Intrinsic::riscv_vloxseg5:
  case Intrinsic::riscv_vloxseg6:
  case Intrinsic::riscv_vloxseg7:
  case Intrinsic::riscv_vloxseg8:
  case Intrinsic::riscv_vluxseg2:
  case Intrinsic::riscv_vluxseg3:
  case Intrinsic::riscv_vluxseg4:
  case Intrinsic::riscv_vluxseg5:
  case Intrinsic::riscv_vluxseg6:
  case Intrinsic::riscv_vluxseg7:
  case Intrinsic::riscv_vluxseg8:
  case Intrinsic::riscv_vsoxseg2:
  case Intrinsic::riscv_vsoxseg3:
  case Intrinsic::riscv_vsoxseg4:
  case Intrinsic::riscv_vsoxseg5:
  case Intrinsic::riscv_vsoxseg6:
  case Intrinsic::riscv_vsoxseg7:
  case Intrinsic::riscv_vsoxseg8:
  case Intrinsic::riscv_vsuxseg2:
  case Intrinsic::riscv_vsuxseg3:
  case Intrinsic::riscv_vsuxseg4:
  case Intrinsic::riscv_vsuxseg5:
  case Intrinsic::riscv_vsuxseg6:
  case Intrinsic::riscv_vsuxseg7:
  case Intrinsic::riscv_vsuxseg8: {
    // Intrinsic interface (only listed ordered version):
    // riscv_vloxei(merge, ptr, index, vl)
    // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
    // riscv_vsoxei(val, ptr, index, vl)
    // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
    // riscv_vloxseg#(merge, ptr, index, vl, sew)
    // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
    // riscv_vsoxseg#(val, ptr, index, vl, sew)
    // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
    bool IsWrite = Inst->getType()->isVoidTy();
    Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
    // The results of segment loads are TargetExtType.
    if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
      unsigned SEW =
          1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
                   ->getZExtValue();
      Ty = TarExtTy->getTypeParameter(0U);
      // NOTE(review): extraction dropped the "Ty = ScalableVectorType::get("
      // line here; the next two lines are that call's arguments.
          IntegerType::get(C, SEW),
          cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
    }
    const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
    unsigned VLIndex = RVVIInfo->VLOperand;
    unsigned PtrOperandNo = VLIndex - 2 - HasMask;
    Value *Mask;
    if (HasMask) {
      Mask = Inst->getArgOperand(VLIndex - 1);
    } else {
      // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
      // and casting that to scalar i64 triggers a vector/scalar mismatch
      // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
      // via extractelement instead.
      Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
      Mask = ConstantInt::getTrue(MaskType);
    }
    Value *EVL = Inst->getArgOperand(VLIndex);
    unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
    // RVV uses contiguous elements as a segment.
    if (SegNum > 1) {
      unsigned ElemSize = Ty->getScalarSizeInBits();
      auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
      Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
    }
    Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
    // Indexed access: per-element offsets, so element alignment is unknown
    // (Align(1)); the index vector is passed through as OffsetOp.
    Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
                                          Align(1), Mask, EVL,
                                          /* Stride */ nullptr, OffsetOp);
    return true;
  }
  }
  return false;
}
3285
// NOTE(review): extraction dropped the signature line (upstream: "unsigned
// RISCVTTIImpl::getRegUsageForType(Type *Ty) const {"); confirm against
// upstream. Returns the number of registers (vector register groups for RVV
// types) the type occupies.
  if (Ty->isVectorTy()) {
    // f16 with only zvfhmin and bf16 will be promoted to f32
    Type *EltTy = cast<VectorType>(Ty)->getElementType();
    if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
        EltTy->isBFloatTy())
      Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
                           cast<VectorType>(Ty));

    TypeSize Size = DL.getTypeSizeInBits(Ty);
    // Scalable vectors: one register per RVVBitsPerBlock-sized chunk.
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    // Fixed vectors lowered to RVV: count against the minimum VLEN.
    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}
3305
// Maximum vectorization factor for SLP, per element width in bits.
unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  // Honor the -riscv-v-slp-max-vf override when present.
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitwidth. This is the
  // same routine as used in LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
  // NOTE(review): extraction dropped the initializer line here (upstream
  // calls getRegisterBitWidth for the fixed-width-vector register class).
  // If no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}
3321
3325
3327 return ST->enableUnalignedVectorMem();
3328}
3329
// NOTE(review): extraction dropped the leading signature lines (upstream:
// "TTI::AddressingModeKind RISCVTTIImpl::getPreferredAddressingMode(
//      const Loop *L,"); confirm against upstream.
                                            ScalarEvolution *SE) const {
  // The CORE-V XCVmem extension provides post-increment memory ops on RV32.
  if (ST->hasVendorXCVmem() && !ST->is64Bit())
    return TTI::AMK_PostIndexed;

  // NOTE(review): extraction dropped the default return line here (upstream
  // falls back to BasicTTIImplBase::getPreferredAddressingMode(L, SE)).
}
3338
// NOTE(review): extraction dropped the first signature line (upstream:
// "bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,");
// confirm against upstream.
                                 const TargetTransformInfo::LSRCost &C2) const {
  // RISC-V specific here are "instruction number 1st priority".
  // If we need to emit adds inside the loop to add up base registers, then
  // we need at least one extra temporary register.
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  // Lexicographic comparison: instruction count first, then adjusted register
  // pressure, then the remaining LSR cost components in order.
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
3353
// NOTE(review): extraction dropped the first signature line; from the body
// (fixed-vector-only masked load/store legality with an i8/LMUL carve-out)
// this appears to be the masked expand-load legality hook — confirm the
// exact name against upstream.
                                           Align Alignment) const {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  // Only fixed-length vector types are supported here.
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;

  // FIXME: If it is an i8 vector and the element count exceeds 256, we should
  // scalarize these types with LMUL >= maximum fixed-length LMUL.
  if (VTy->getElementType()->isIntegerTy(8))
    if (VTy->getElementCount().getFixedValue() > 256)
      return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
             ST->getMaxLMULForFixedLengthVectors();
  return true;
}
3371
// NOTE(review): extraction dropped the first signature line; from position
// and body this appears to be the masked compress-store legality hook —
// confirm the exact name against upstream.
                                           Align Alignment) const {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  // Only fixed-length vector types are supported here.
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;
  return true;
}
3382
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with right type and used in memory accesses. If it used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
// NOTE(review): extraction dropped the first signature line (upstream:
// "bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(").
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  // Only sign-extensions are candidates.
  if (!isa<SExtInst>(&I))
    return false;
  // Only sext to i64 (address width) is interesting.
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such complex GEP as we
      // expect some computation to be merged if they are done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}
3414
3415bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3416 switch (Opcode) {
3417 case Instruction::Add:
3418 case Instruction::Sub:
3419 case Instruction::Mul:
3420 case Instruction::And:
3421 case Instruction::Or:
3422 case Instruction::Xor:
3423 case Instruction::FAdd:
3424 case Instruction::FSub:
3425 case Instruction::FMul:
3426 case Instruction::FDiv:
3427 case Instruction::ICmp:
3428 case Instruction::FCmp:
3429 return true;
3430 case Instruction::Shl:
3431 case Instruction::LShr:
3432 case Instruction::AShr:
3433 case Instruction::UDiv:
3434 case Instruction::SDiv:
3435 case Instruction::URem:
3436 case Instruction::SRem:
3437 case Instruction::Select:
3438 return Operand == 1;
3439 default:
3440 return false;
3441 }
3442}
3443
// NOTE(review): extraction dropped the signature line (upstream this is the
// Instruction*-based overload: "bool RISCVTTIImpl::canSplatOperand(
//     const Instruction *I, int Operand) const {"); confirm against upstream.
  // Splatting only matters for vector results on RVV-capable subtargets.
  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
    return false;

  // Plain IR opcodes are handled by the opcode-based overload.
  if (canSplatOperand(I->getOpcode(), Operand))
    return true;

  auto *II = dyn_cast<IntrinsicInst>(I);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::fma:
  case Intrinsic::vp_fma:
  case Intrinsic::fmuladd:
  case Intrinsic::vp_fmuladd:
    return Operand == 0 || Operand == 1;
  case Intrinsic::vp_shl:
  case Intrinsic::vp_lshr:
  case Intrinsic::vp_ashr:
  case Intrinsic::vp_udiv:
  case Intrinsic::vp_sdiv:
  case Intrinsic::vp_urem:
  case Intrinsic::vp_srem:
  case Intrinsic::ssub_sat:
  case Intrinsic::vp_ssub_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::vp_usub_sat:
  case Intrinsic::vp_select:
    return Operand == 1;
  // These intrinsics are commutative.
  case Intrinsic::vp_add:
  case Intrinsic::vp_mul:
  case Intrinsic::vp_and:
  case Intrinsic::vp_or:
  case Intrinsic::vp_xor:
  case Intrinsic::vp_fadd:
  case Intrinsic::vp_fmul:
  case Intrinsic::vp_icmp:
  case Intrinsic::vp_fcmp:
  case Intrinsic::smin:
  case Intrinsic::vp_smin:
  case Intrinsic::umin:
  case Intrinsic::vp_umin:
  case Intrinsic::smax:
  case Intrinsic::vp_smax:
  case Intrinsic::umax:
  case Intrinsic::vp_umax:
  case Intrinsic::sadd_sat:
  case Intrinsic::vp_sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::vp_uadd_sat:
  // These intrinsics have 'vr' versions.
  case Intrinsic::vp_sub:
  case Intrinsic::vp_fsub:
  case Intrinsic::vp_fdiv:
    return Operand == 0 || Operand == 1;
  default:
    return false;
  }
}
3505
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// splats of scalars can fold into vector instructions.
// NOTE(review): extraction dropped the signature lines here (upstream:
// "bool RISCVTTIImpl::isProfitableToSinkOperands(
//      Instruction *I, SmallVectorImpl<Use *> &Ops) const {"); confirm
// against upstream.
  using namespace llvm::PatternMatch;

  if (I->isBitwiseLogicOp()) {
    if (!I->getType()->isVectorTy()) {
      // Scalar case: Zbb/Zbkb provide and-not style instructions.
      if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
        for (auto &Op : I->operands()) {
          // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
          if (match(Op.get(), m_Not(m_Value()))) {
            Ops.push_back(&Op);
            return true;
          }
        }
      }
    } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
      for (auto &Op : I->operands()) {
        // (and X, (not Y)) -> (vandn.vv X, Y)
        if (match(Op.get(), m_Not(m_Value()))) {
          Ops.push_back(&Op);
          return true;
        }
        // (and X, (splat (not Y))) -> (vandn.vx X, Y)
        // NOTE(review): the leading line of this match() pattern (the shuffle/
        // insertelement-of-not head) was dropped by extraction; the next two
        // lines are its trailing arguments.
                            m_ZeroInt()),
                  m_Value(), m_ZeroMask()))) {
          Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
          Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
          Ops.push_back(&Not);
          Ops.push_back(&InsertElt);
          Ops.push_back(&Op);
          return true;
        }
      }
    }
  }

  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
    return false;

  // Don't sink splat operands if the target prefers it. Some targets requires
  // S2V transfer buffers and we can run out of them copying the same value
  // repeatedly.
  // FIXME: It could still be worth doing if it would improve vector register
  // pressure and prevent a vector spill.
  if (!ST->sinkSplatOperands())
    return false;

  for (auto OpIdx : enumerate(I->operands())) {
    if (!canSplatOperand(I, OpIdx.index()))
      continue;

    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    // We are looking for a splat that can be sunk.
    // NOTE(review): the leading line of this match() pattern (the
    // shuffle-of-insertelement splat head) was dropped by extraction; the
    // next line is its trailing arguments.
                 m_Value(), m_ZeroMask())))
      continue;

    // Don't sink i1 splats.
    if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
    // and vector registers
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!canSplatOperand(Insn, U.getOperandNo()))
        return false;
    }

    // Sink any fpexts since they might be used in a widening fp pattern.
    Use *InsertEltUse = &Op->getOperandUse(0);
    auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
    if (isa<FPExtInst>(InsertElt->getOperand(1)))
      Ops.push_back(&InsertElt->getOperandUse(1));
    Ops.push_back(InsertEltUse);
    Ops.push_back(&OpIdx.value());
  }
  return true;
}
3593
3595RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3597 // TODO: Enable expansion when unaligned access is not supported after we fix
3598 // issues in ExpandMemcmp.
3599 if (!ST->enableUnalignedScalarMem())
3600 return Options;
3601
3602 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3603 return Options;
3604
3605 Options.AllowOverlappingLoads = true;
3606 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3607 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3608 if (ST->is64Bit()) {
3609 Options.LoadSizes = {8, 4, 2, 1};
3610 Options.AllowedTailExpansions = {3, 5, 6};
3611 } else {
3612 Options.LoadSizes = {4, 2, 1};
3613 Options.AllowedTailExpansions = {3};
3614 }
3615
3616 if (IsZeroCmp && ST->hasVInstructions()) {
3617 unsigned VLenB = ST->getRealMinVLen() / 8;
3618 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
3619 // `VLenB * MaxLMUL` so that it fits in a single register group.
3620 unsigned MinSize = ST->getXLen() / 8 + 1;
3621 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3622 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3623 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3624 }
3625 return Options;
3626}
3627
3629 const Instruction *I) const {
3631 // For the binary operators (e.g. or) we need to be more careful than
3632 // selects, here we only transform them if they are already at a natural
3633 // break point in the code - the end of a block with an unconditional
3634 // terminator.
3635 if (I->getOpcode() == Instruction::Or &&
3636 isa<UncondBrInst>(I->getNextNode()))
3637 return true;
3638
3639 if (I->getOpcode() == Instruction::Add ||
3640 I->getOpcode() == Instruction::Sub)
3641 return true;
3642 }
3644}
3645
3647 const Function *Caller, const Attribute &Attr) const {
3648 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3649 // restrictions on their signatures). We can outline from the bodies of these
3650 // handlers, but when we do we need to make sure we don't mark the outlined
3651 // function as an interrupt handler too.
3652 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3653 return false;
3654
3656}
3657
3658std::optional<Instruction *>
3660 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
3661 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
3662 // creating redundant masks.
3663 const DataLayout &DL = IC.getDataLayout();
3664 if (II.user_empty())
3665 return {};
3666 auto *TargetVecTy = dyn_cast<ScalableVectorType>(II.user_back()->getType());
3667 if (!TargetVecTy)
3668 return {};
3669 const APInt *Scalar;
3670 uint64_t VL;
3672 m_Poison(), m_APInt(Scalar), m_ConstantInt(VL))) ||
3673 !all_of(II.users(), [TargetVecTy](User *U) {
3674 return U->getType() == TargetVecTy && match(U, m_BitCast(m_Value()));
3675 }))
3676 return {};
3677 auto *SourceVecTy = cast<ScalableVectorType>(II.getType());
3678 unsigned TargetEltBW = DL.getTypeSizeInBits(TargetVecTy->getElementType());
3679 unsigned SourceEltBW = DL.getTypeSizeInBits(SourceVecTy->getElementType());
3680 if (TargetEltBW % SourceEltBW)
3681 return {};
3682 unsigned TargetScale = TargetEltBW / SourceEltBW;
3683 if (VL % TargetScale)
3684 return {};
3685 Type *VLTy = II.getOperand(2)->getType();
3686 ElementCount SourceEC = SourceVecTy->getElementCount();
3687 unsigned NewEltBW = SourceEltBW * TargetScale;
3688 if (!SourceEC.isKnownMultipleOf(TargetScale) ||
3689 !DL.fitsInLegalInteger(NewEltBW))
3690 return {};
3691 auto *NewEltTy = IntegerType::get(II.getContext(), NewEltBW);
3692 if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, NewEltTy)))
3693 return {};
3694 ElementCount NewEC = SourceEC.divideCoefficientBy(TargetScale);
3695 Type *RetTy = VectorType::get(NewEltTy, NewEC);
3696 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
3697 "Lossless bitcast between types expected");
3698 APInt NewScalar = APInt::getSplat(NewEltBW, *Scalar);
3699 return IC.replaceInstUsesWith(
3700 II,
3703 RetTy, Intrinsic::riscv_vmv_v_x,
3704 {PoisonValue::get(RetTy), ConstantInt::get(NewEltTy, NewScalar),
3705 ConstantInt::get(VLTy, VL / TargetScale)}),
3706 SourceVecTy));
3707}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool shouldSplit(Instruction *InsertPoint, DenseSet< Value * > &PrevConditionValues, DenseSet< Value * > &ConditionValues, DominatorTree &DT, DenseSet< Instruction * > &Unhoistables)
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static InstructionCost costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, std::optional< unsigned > VLen, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Try to perform better estimation of the permutation.
static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI, MVT LegalVT, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind)
Attempt to approximate the cost of a shuffle which will require splitting during legalization.
static bool isRepeatedConcatMask(ArrayRef< int > Mask, int &SubVectorSize)
static unsigned isM1OrSmaller(MVT VT)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-riscv-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static cl::opt< unsigned > RVVMinTripCount("riscv-v-min-trip-count", cl::desc("Set the lower bound of a trip count to decide on " "vectorization while tail-folding."), cl::init(5), cl::Hidden)
static InstructionCost getIntImmCostImpl(const DataLayout &DL, const RISCVSubtarget *ST, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, bool FreeZeroes)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm)
This file defines a TargetTransformInfoImplBase conforming object specific to the RISC-V target machi...
static Type * getValueType(Value *V, bool LookThroughCmp=false)
Returns the "element type" of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool isStringAttribute() const
Return true if the attribute is a string (target-dependent) attribute.
LLVM_ABI StringRef getKindAsString() const
Return the attribute's kind as a string.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
std::optional< unsigned > getMaxVScale() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
bool isLegalAddImmediate(int64_t imm) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
This class represents a range of values.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noNaNs() const
Definition FMF.h:68
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2210
The core instruction combiner logic.
const DataLayout & getDataLayout() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
BuilderTy & Builder
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Information for memory intrinsic cost model.
const Instruction * getInst() const
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
InstructionCost getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
std::optional< InstructionCost > getCombinedArithmeticInstructionCost(unsigned ISDOpcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI) const
Check to see if this instruction is expected to be combined to a simpler operation during/before lowe...
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
InstructionCost getExpandCompressMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector regis...
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool shouldCopyAttributeWhenOutliningFrom(const Function *Caller, const Attribute &Attr) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value RHS.
Definition TypeSize.h:180
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the same way as for basic types.
Definition TypeSize.h:252
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition MathExtras.h:273
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1885
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2146
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).