1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/Analysis/TargetTransformInfo.h"
13#include "llvm/CodeGen/BasicTTIImpl.h"
14#include "llvm/CodeGen/CostTable.h"
15#include "llvm/CodeGen/TargetLowering.h"
16#include "llvm/IR/Instructions.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
23static cl::opt<unsigned> RVVRegisterWidthLMUL(
24 "riscv-v-register-bit-width-lmul",
25 cl::desc(
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
28 cl::init(2), cl::Hidden);
29
30static cl::opt<unsigned> SLPMaxVF(
31 "riscv-v-slp-max-vf",
32 cl::desc(
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
35 cl::Hidden);
36
37InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
38 TTI::TargetCostKind CostKind) {
39 assert(Ty->isIntegerTy() &&
40 "getIntImmCost can only estimate cost of materialising integers");
41
42 // We have a Zero register, so 0 is always free.
43 if (Imm == 0)
44 return TTI::TCC_Free;
45
46 // Otherwise, we check how many instructions it will take to materialise.
47 const DataLayout &DL = getDataLayout();
48 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
49 getST()->getFeatureBits());
50}
51
52// Look for patterns of shift followed by AND that can be turned into a pair of
53// shifts. We won't need to materialize an immediate for the AND so these can
54// be considered free.
55static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
56 uint64_t Mask = Imm.getZExtValue();
57 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
58 if (!BO || !BO->hasOneUse())
59 return false;
60
61 if (BO->getOpcode() != Instruction::Shl)
62 return false;
63
64 if (!isa<ConstantInt>(BO->getOperand(1)))
65 return false;
66
67 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
68 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
69 // is a mask shifted by c2 bits with c3 leading zeros.
70 if (isShiftedMask_64(Mask)) {
71 unsigned Trailing = llvm::countr_zero(Mask);
72 if (ShAmt == Trailing)
73 return true;
74 }
75
76 return false;
77}
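// Illustrative example (annotation, not in the upstream source): with
//   (and (shl x, 11), 0xFFF800)
// the mask 0xFFF800 is a shifted mask whose 11 trailing zeros equal the shift
// amount, so the constant never has to be materialised; codegen can instead
// emit the pair
//   slli a0, a0, 52
//   srli a0, a0, 41
// matching the (srli (slli x, c2+c3), c3) pattern described above.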
78
79InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
80 const APInt &Imm, Type *Ty,
81 TTI::TargetCostKind CostKind,
82 Instruction *Inst) {
83 assert(Ty->isIntegerTy() &&
84 "getIntImmCost can only estimate cost of materialising integers");
85
86 // We have a Zero register, so 0 is always free.
87 if (Imm == 0)
88 return TTI::TCC_Free;
89
90 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
91 // commutative, in others the immediate comes from a specific argument index.
92 bool Takes12BitImm = false;
93 unsigned ImmArgIdx = ~0U;
94
95 switch (Opcode) {
96 case Instruction::GetElementPtr:
97 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
98 // split up large offsets in GEP into better parts than ConstantHoisting
99 // can.
100 return TTI::TCC_Free;
101 case Instruction::And:
102 // zext.h
103 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
104 return TTI::TCC_Free;
105 // zext.w
106 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
107 return TTI::TCC_Free;
108 // bclri
109 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
110 return TTI::TCC_Free;
111 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
112 canUseShiftPair(Inst, Imm))
113 return TTI::TCC_Free;
114 Takes12BitImm = true;
115 break;
116 case Instruction::Add:
117 Takes12BitImm = true;
118 break;
119 case Instruction::Or:
120 case Instruction::Xor:
121 // bseti/binvi
122 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
123 return TTI::TCC_Free;
124 Takes12BitImm = true;
125 break;
126 case Instruction::Mul:
127 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
128 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
129 return TTI::TCC_Free;
130 // FIXME: There is no MULI instruction.
131 Takes12BitImm = true;
132 break;
133 case Instruction::Sub:
134 case Instruction::Shl:
135 case Instruction::LShr:
136 case Instruction::AShr:
137 Takes12BitImm = true;
138 ImmArgIdx = 1;
139 break;
140 default:
141 break;
142 }
143
144 if (Takes12BitImm) {
145 // Check immediate is the correct argument...
146 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
147 // ... and fits into the 12-bit immediate.
148 if (Imm.getSignificantBits() <= 64 &&
149 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
150 return TTI::TCC_Free;
151 }
152 }
153
154 // Otherwise, use the full materialisation cost.
155 return getIntImmCost(Imm, Ty, CostKind);
156 }
157
158 // By default, prevent hoisting.
159 return TTI::TCC_Free;
160}
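// Illustrative note (annotation, not in the upstream source): the 12-bit
// immediate check above means e.g. an `add` of 2047 is considered free (it
// fits addi), while an `add` of 2048 falls through to the full
// materialisation cost, since 2048 is outside the signed 12-bit range
// [-2048, 2047].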
161
162InstructionCost
163RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
164 const APInt &Imm, Type *Ty,
165 TTI::TargetCostKind CostKind) {
166 // Prevent hoisting in unknown cases.
167 return TTI::TCC_Free;
168}
169
170TargetTransformInfo::PopcntSupportKind
171RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
172 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
173 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
174}
175
176bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
177 // Currently, the ExpandReductions pass can't expand scalable-vector
178 // reductions, but we still request expansion as RVV doesn't support certain
179 // reductions and the SelectionDAG can't legalize them either.
180 switch (II->getIntrinsicID()) {
181 default:
182 return false;
183 // These reductions have no equivalent in RVV
184 case Intrinsic::vector_reduce_mul:
185 case Intrinsic::vector_reduce_fmul:
186 return true;
187 }
188}
189
190std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
191 if (ST->hasVInstructions())
192 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
193 return BaseT::getMaxVScale();
194}
195
196std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
197 if (ST->hasVInstructions())
198 if (unsigned MinVLen = ST->getRealMinVLen();
199 MinVLen >= RISCV::RVVBitsPerBlock)
200 return MinVLen / RISCV::RVVBitsPerBlock;
201 return std::nullopt;
202}
203
204TypeSize
205RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
206 unsigned LMUL =
207 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
208 switch (K) {
209 case TargetTransformInfo::RGK_Scalar:
210 return TypeSize::getFixed(ST->getXLen());
211 case TargetTransformInfo::RGK_FixedWidthVector:
212 return TypeSize::getFixed(
213 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
214 case TargetTransformInfo::RGK_ScalableVector:
215 return TypeSize::getScalable(
216 (ST->hasVInstructions() &&
217 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
218 ? LMUL * RISCV::RVVBitsPerBlock
219 : 0);
220 }
221
222 llvm_unreachable("Unsupported register kind");
223}
224
225InstructionCost
226RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
227 // Add a cost of address generation + the cost of the load. The address
228 // is expected to be a PC relative offset to a constant pool entry
229 // using auipc/addi.
230 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
231 /*AddressSpace=*/0, CostKind);
232}
233
234static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
235 LLVMContext &C) {
236 assert((DataVT.getScalarSizeInBits() != 8 ||
237 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
238 MVT IndexVT = DataVT.changeTypeToInteger();
239 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
240 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
241 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
242}
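// Illustrative note (annotation, not in the upstream source): shrinking the
// index type is what permits the vrgatherei16 form, e.g. for an MVT::v4i64
// data vector on RV32 (XLEN=32) the i64 index type produced by
// changeTypeToInteger() is wider than XLEN, so v4i16 indices are used instead.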
243
244InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
245 VectorType *Tp, ArrayRef<int> Mask,
246 TTI::TargetCostKind CostKind,
247 int Index, VectorType *SubTp,
248 ArrayRef<const Value *> Args) {
249 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
250
251 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
252
253 // First, handle cases where having a fixed length vector enables us to
254 // give a more accurate cost than falling back to generic scalable codegen.
255 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
256 if (isa<FixedVectorType>(Tp)) {
257 switch (Kind) {
258 default:
259 break;
260 case TTI::SK_PermuteSingleSrc: {
261 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
262 MVT EltTp = LT.second.getVectorElementType();
263 // If the size of the element is < ELEN then shuffles of interleaves and
264 // deinterleaves of 2 vectors can be lowered into the following
265 // sequences
266 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
267 // Example sequence:
268 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
269 // vwaddu.vv v10, v8, v9
270 // li a0, -1 (ignored)
271 // vwmaccu.vx v10, a0, v9
272 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
273 return 2 * LT.first * TLI->getLMULCost(LT.second);
274
275 if (Mask[0] == 0 || Mask[0] == 1) {
276 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
277 // Example sequence:
278 // vnsrl.wi v10, v8, 0
279 if (equal(DeinterleaveMask, Mask))
280 return LT.first * TLI->getLMULCost(LT.second);
281 }
282 }
283 }
284 // vrgather + cost of generating the mask constant.
285 // We model this for an unknown mask with a single vrgather.
286 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
287 (LT.second.getScalarSizeInBits() != 8 ||
288 LT.second.getVectorNumElements() <= 256)) {
289 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
290 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
291 return IndexCost + TLI->getVRGatherVVCost(LT.second);
292 }
293 [[fallthrough]];
294 }
295 case TTI::SK_Transpose:
296 case TTI::SK_PermuteTwoSrc: {
297 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
298 // register for the second vrgather. We model this for an unknown
299 // (shuffle) mask.
300 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
301 (LT.second.getScalarSizeInBits() != 8 ||
302 LT.second.getVectorNumElements() <= 256)) {
303 auto &C = Tp->getContext();
304 auto EC = Tp->getElementCount();
305 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
306 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
307 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
308 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
309 return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost;
310 }
311 [[fallthrough]];
312 }
313 case TTI::SK_Select: {
314 // We are going to permute multiple sources and the result will be in
315 // multiple destinations. We provide an accurate cost only for splits where
316 // the element type remains the same.
317 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
318 LT.second.isFixedLengthVector() &&
319 LT.second.getVectorElementType().getSizeInBits() ==
320 Tp->getElementType()->getPrimitiveSizeInBits() &&
321 LT.second.getVectorNumElements() <
322 cast<FixedVectorType>(Tp)->getNumElements()) {
323 unsigned NumRegs = *LT.first.getValue();
324 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
325 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
326 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
327
328 InstructionCost Cost = 0;
329 for (unsigned I = 0; I < NumRegs; ++I) {
330 bool IsSingleVector = true;
331 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
332 transform(Mask.slice(I * SubVF,
333 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
334 SubMask.begin(), [&](int I) {
335 bool SingleSubVector = I / VF == 0;
336 IsSingleVector &= SingleSubVector;
337 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
338 });
339 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
340 : TTI::SK_PermuteTwoSrc,
341 SubVecTy, SubMask, CostKind, 0, nullptr);
342 return Cost;
343 }
344 }
345 break;
346 }
347 }
348 };
349
350 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
351 switch (Kind) {
352 default:
353 // Fallthrough to generic handling.
354 // TODO: Most of these cases will return getInvalid in generic code, and
355 // must be implemented here.
356 break;
357 case TTI::SK_ExtractSubvector:
358 // Example sequence:
359 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
360 // vslidedown.vi v8, v9, 2
361 return LT.first * TLI->getVSlideCost(LT.second);
362 case TTI::SK_InsertSubvector:
363 // Example sequence:
364 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
365 // vslideup.vi v8, v9, 2
366 return LT.first * TLI->getVSlideCost(LT.second);
367 case TTI::SK_Select: {
368 // Example sequence:
369 // li a0, 90
370 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
371 // vmv.s.x v0, a0
372 // vmerge.vvm v8, v9, v8, v0
373 return LT.first * 3 * TLI->getLMULCost(LT.second);
374 }
375 case TTI::SK_Broadcast: {
376 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
377 Instruction::InsertElement);
378 if (LT.second.getScalarSizeInBits() == 1) {
379 if (HasScalar) {
380 // Example sequence:
381 // andi a0, a0, 1
382 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
383 // vmv.v.x v8, a0
384 // vmsne.vi v0, v8, 0
385 return LT.first * TLI->getLMULCost(LT.second) * 3;
386 }
387 // Example sequence:
388 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
389 // vmv.v.i v8, 0
390 // vmerge.vim v8, v8, 1, v0
391 // vmv.x.s a0, v8
392 // andi a0, a0, 1
393 // vmv.v.x v8, a0
394 // vmsne.vi v0, v8, 0
395
396 return LT.first * TLI->getLMULCost(LT.second) * 6;
397 }
398
399 if (HasScalar) {
400 // Example sequence:
401 // vmv.v.x v8, a0
402 return LT.first * TLI->getLMULCost(LT.second);
403 }
404
405 // Example sequence:
406 // vrgather.vi v9, v8, 0
407 return LT.first * TLI->getVRGatherVICost(LT.second);
408 }
409 case TTI::SK_Splice:
410 // vslidedown+vslideup.
411 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
412 // of similar code, but I think we expand through memory.
413 return 2 * LT.first * TLI->getVSlideCost(LT.second);
414 case TTI::SK_Reverse: {
415 // TODO: Cases to improve here:
416 // * Illegal vector types
417 // * i64 on RV32
418 // * i1 vector
419 // At low LMUL, most of the cost is producing the vrgather index register.
420 // At high LMUL, the cost of the vrgather itself will dominate.
421 // Example sequence:
422 // csrr a0, vlenb
423 // srli a0, a0, 3
424 // addi a0, a0, -1
425 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
426 // vid.v v9
427 // vrsub.vx v10, v9, a0
428 // vrgather.vv v9, v8, v10
429 InstructionCost LenCost = 3;
430 if (LT.second.isFixedLengthVector())
431 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
432 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
433 InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second);
434 // A mask operation additionally requires an extend and a truncate
435 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
436 return LT.first * (LenCost + GatherCost + ExtendCost);
437 }
438 }
439 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
440}
441
442InstructionCost
443RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
444 unsigned AddressSpace,
445 TTI::TargetCostKind CostKind) {
446 if (!isLegalMaskedLoadStore(Src, Alignment) ||
447 CostKind != TTI::TCK_RecipThroughput)
448 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
449 CostKind);
450
451 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
452}
453
454InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
455 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
456 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
457 bool UseMaskForCond, bool UseMaskForGaps) {
458 if (isa<ScalableVectorType>(VecTy))
459 return InstructionCost::getInvalid();
460 auto *FVTy = cast<FixedVectorType>(VecTy);
461 InstructionCost MemCost =
462 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
463 unsigned VF = FVTy->getNumElements() / Factor;
464
465 // The interleaved memory access pass will lower interleaved memory ops (i.e.
466 // a load and store followed by a specific shuffle) to vlseg/vsseg
467 // intrinsics. In those cases we can treat it as if it's just one (legal)
468 // memory op.
469 if (!UseMaskForCond && !UseMaskForGaps &&
470 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
471 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
472 // Need to make sure the type hasn't been scalarized
473 if (LT.second.isFixedLengthVector()) {
474 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
475 LT.second.getVectorNumElements());
476 // FIXME: We use the memory op cost of the *legalized* type here, because
477 // its getMemoryOpCost returns a really expensive cost for types like
478 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
479 // Should the memory op cost of these be cheaper?
480 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
481 AddressSpace, DL)) {
482 InstructionCost LegalMemCost = getMemoryOpCost(
483 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
484 return LT.first + LegalMemCost;
485 }
486 }
487 }
488
489 // An interleaved load will look like this for Factor=3:
490 // %wide.vec = load <12 x i32>, ptr %3, align 4
491 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
492 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
493 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
494 if (Opcode == Instruction::Load) {
495 InstructionCost Cost = MemCost;
496 for (unsigned Index : Indices) {
497 FixedVectorType *SubVecTy =
498 FixedVectorType::get(FVTy->getElementType(), VF);
499 auto Mask = createStrideMask(Index, Factor, VF);
500 InstructionCost ShuffleCost =
501 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
502 CostKind, 0, nullptr, {});
503 Cost += ShuffleCost;
504 }
505 return Cost;
506 }
507
508 // TODO: Model for NF > 2
509 // We'll need to enhance getShuffleCost to model shuffles that are just
510 // inserts and extracts into subvectors, since they won't have the full cost
511 // of a vrgather.
512 // An interleaved store for 3 vectors of 4 lanes will look like
513 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
514 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
515 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
516 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
517 // store <12 x i32> %interleaved.vec, ptr %10, align 4
518 if (Factor != 2)
519 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
520 Alignment, AddressSpace, CostKind,
521 UseMaskForCond, UseMaskForGaps);
522
523 assert(Opcode == Instruction::Store && "Opcode must be a store");
524 // For an interleaving store of 2 vectors, we perform one large interleaving
525 // shuffle that goes into the wide store
526 auto Mask = createInterleaveMask(VF, Factor);
527 InstructionCost ShuffleCost =
528 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
529 CostKind, 0, nullptr, {});
530 return MemCost + ShuffleCost;
531}
532
533InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
534 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
535 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
536 if (CostKind != TTI::TCK_RecipThroughput)
537 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
538 Alignment, CostKind, I);
539
540 if ((Opcode == Instruction::Load &&
541 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
542 (Opcode == Instruction::Store &&
543 !isLegalMaskedScatter(DataTy, Align(Alignment))))
544 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
545 Alignment, CostKind, I);
546
547 // Cost is proportional to the number of memory operations implied. For
548 // scalable vectors, we use an estimate on that number since we don't
549 // know exactly what VL will be.
550 auto &VTy = *cast<VectorType>(DataTy);
551 InstructionCost MemOpCost =
552 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
553 {TTI::OK_AnyValue, TTI::OP_None}, I);
554 unsigned NumLoads = getEstimatedVLFor(&VTy);
555 return NumLoads * MemOpCost;
556}
557
558// Currently, these represent both throughput and codesize costs
559// for the respective intrinsics. The costs in this table are simply
560// instruction counts with the following adjustments made:
561// * One vsetvli is considered free.
562static const CostTblEntry VectorIntrinsicCostTable[]{
563 {Intrinsic::floor, MVT::v2f32, 9},
564 {Intrinsic::floor, MVT::v4f32, 9},
565 {Intrinsic::floor, MVT::v8f32, 9},
566 {Intrinsic::floor, MVT::v16f32, 9},
567 {Intrinsic::floor, MVT::nxv1f32, 9},
568 {Intrinsic::floor, MVT::nxv2f32, 9},
569 {Intrinsic::floor, MVT::nxv4f32, 9},
570 {Intrinsic::floor, MVT::nxv8f32, 9},
571 {Intrinsic::floor, MVT::nxv16f32, 9},
572 {Intrinsic::floor, MVT::v2f64, 9},
573 {Intrinsic::floor, MVT::v4f64, 9},
574 {Intrinsic::floor, MVT::v8f64, 9},
575 {Intrinsic::floor, MVT::v16f64, 9},
576 {Intrinsic::floor, MVT::nxv1f64, 9},
577 {Intrinsic::floor, MVT::nxv2f64, 9},
578 {Intrinsic::floor, MVT::nxv4f64, 9},
579 {Intrinsic::floor, MVT::nxv8f64, 9},
580 {Intrinsic::ceil, MVT::v2f32, 9},
581 {Intrinsic::ceil, MVT::v4f32, 9},
582 {Intrinsic::ceil, MVT::v8f32, 9},
583 {Intrinsic::ceil, MVT::v16f32, 9},
584 {Intrinsic::ceil, MVT::nxv1f32, 9},
585 {Intrinsic::ceil, MVT::nxv2f32, 9},
586 {Intrinsic::ceil, MVT::nxv4f32, 9},
587 {Intrinsic::ceil, MVT::nxv8f32, 9},
588 {Intrinsic::ceil, MVT::nxv16f32, 9},
589 {Intrinsic::ceil, MVT::v2f64, 9},
590 {Intrinsic::ceil, MVT::v4f64, 9},
591 {Intrinsic::ceil, MVT::v8f64, 9},
592 {Intrinsic::ceil, MVT::v16f64, 9},
593 {Intrinsic::ceil, MVT::nxv1f64, 9},
594 {Intrinsic::ceil, MVT::nxv2f64, 9},
595 {Intrinsic::ceil, MVT::nxv4f64, 9},
596 {Intrinsic::ceil, MVT::nxv8f64, 9},
597 {Intrinsic::trunc, MVT::v2f32, 7},
598 {Intrinsic::trunc, MVT::v4f32, 7},
599 {Intrinsic::trunc, MVT::v8f32, 7},
600 {Intrinsic::trunc, MVT::v16f32, 7},
601 {Intrinsic::trunc, MVT::nxv1f32, 7},
602 {Intrinsic::trunc, MVT::nxv2f32, 7},
603 {Intrinsic::trunc, MVT::nxv4f32, 7},
604 {Intrinsic::trunc, MVT::nxv8f32, 7},
605 {Intrinsic::trunc, MVT::nxv16f32, 7},
606 {Intrinsic::trunc, MVT::v2f64, 7},
607 {Intrinsic::trunc, MVT::v4f64, 7},
608 {Intrinsic::trunc, MVT::v8f64, 7},
609 {Intrinsic::trunc, MVT::v16f64, 7},
610 {Intrinsic::trunc, MVT::nxv1f64, 7},
611 {Intrinsic::trunc, MVT::nxv2f64, 7},
612 {Intrinsic::trunc, MVT::nxv4f64, 7},
613 {Intrinsic::trunc, MVT::nxv8f64, 7},
614 {Intrinsic::round, MVT::v2f32, 9},
615 {Intrinsic::round, MVT::v4f32, 9},
616 {Intrinsic::round, MVT::v8f32, 9},
617 {Intrinsic::round, MVT::v16f32, 9},
618 {Intrinsic::round, MVT::nxv1f32, 9},
619 {Intrinsic::round, MVT::nxv2f32, 9},
620 {Intrinsic::round, MVT::nxv4f32, 9},
621 {Intrinsic::round, MVT::nxv8f32, 9},
622 {Intrinsic::round, MVT::nxv16f32, 9},
623 {Intrinsic::round, MVT::v2f64, 9},
624 {Intrinsic::round, MVT::v4f64, 9},
625 {Intrinsic::round, MVT::v8f64, 9},
626 {Intrinsic::round, MVT::v16f64, 9},
627 {Intrinsic::round, MVT::nxv1f64, 9},
628 {Intrinsic::round, MVT::nxv2f64, 9},
629 {Intrinsic::round, MVT::nxv4f64, 9},
630 {Intrinsic::round, MVT::nxv8f64, 9},
631 {Intrinsic::roundeven, MVT::v2f32, 9},
632 {Intrinsic::roundeven, MVT::v4f32, 9},
633 {Intrinsic::roundeven, MVT::v8f32, 9},
634 {Intrinsic::roundeven, MVT::v16f32, 9},
635 {Intrinsic::roundeven, MVT::nxv1f32, 9},
636 {Intrinsic::roundeven, MVT::nxv2f32, 9},
637 {Intrinsic::roundeven, MVT::nxv4f32, 9},
638 {Intrinsic::roundeven, MVT::nxv8f32, 9},
639 {Intrinsic::roundeven, MVT::nxv16f32, 9},
640 {Intrinsic::roundeven, MVT::v2f64, 9},
641 {Intrinsic::roundeven, MVT::v4f64, 9},
642 {Intrinsic::roundeven, MVT::v8f64, 9},
643 {Intrinsic::roundeven, MVT::v16f64, 9},
644 {Intrinsic::roundeven, MVT::nxv1f64, 9},
645 {Intrinsic::roundeven, MVT::nxv2f64, 9},
646 {Intrinsic::roundeven, MVT::nxv4f64, 9},
647 {Intrinsic::roundeven, MVT::nxv8f64, 9},
648 {Intrinsic::rint, MVT::v2f32, 7},
649 {Intrinsic::rint, MVT::v4f32, 7},
650 {Intrinsic::rint, MVT::v8f32, 7},
651 {Intrinsic::rint, MVT::v16f32, 7},
652 {Intrinsic::rint, MVT::nxv1f32, 7},
653 {Intrinsic::rint, MVT::nxv2f32, 7},
654 {Intrinsic::rint, MVT::nxv4f32, 7},
655 {Intrinsic::rint, MVT::nxv8f32, 7},
656 {Intrinsic::rint, MVT::nxv16f32, 7},
657 {Intrinsic::rint, MVT::v2f64, 7},
658 {Intrinsic::rint, MVT::v4f64, 7},
659 {Intrinsic::rint, MVT::v8f64, 7},
660 {Intrinsic::rint, MVT::v16f64, 7},
661 {Intrinsic::rint, MVT::nxv1f64, 7},
662 {Intrinsic::rint, MVT::nxv2f64, 7},
663 {Intrinsic::rint, MVT::nxv4f64, 7},
664 {Intrinsic::rint, MVT::nxv8f64, 7},
665 {Intrinsic::nearbyint, MVT::v2f32, 9},
666 {Intrinsic::nearbyint, MVT::v4f32, 9},
667 {Intrinsic::nearbyint, MVT::v8f32, 9},
668 {Intrinsic::nearbyint, MVT::v16f32, 9},
669 {Intrinsic::nearbyint, MVT::nxv1f32, 9},
670 {Intrinsic::nearbyint, MVT::nxv2f32, 9},
671 {Intrinsic::nearbyint, MVT::nxv4f32, 9},
672 {Intrinsic::nearbyint, MVT::nxv8f32, 9},
673 {Intrinsic::nearbyint, MVT::nxv16f32, 9},
674 {Intrinsic::nearbyint, MVT::v2f64, 9},
675 {Intrinsic::nearbyint, MVT::v4f64, 9},
676 {Intrinsic::nearbyint, MVT::v8f64, 9},
677 {Intrinsic::nearbyint, MVT::v16f64, 9},
678 {Intrinsic::nearbyint, MVT::nxv1f64, 9},
679 {Intrinsic::nearbyint, MVT::nxv2f64, 9},
680 {Intrinsic::nearbyint, MVT::nxv4f64, 9},
681 {Intrinsic::nearbyint, MVT::nxv8f64, 9},
682 {Intrinsic::bswap, MVT::v2i16, 3},
683 {Intrinsic::bswap, MVT::v4i16, 3},
684 {Intrinsic::bswap, MVT::v8i16, 3},
685 {Intrinsic::bswap, MVT::v16i16, 3},
686 {Intrinsic::bswap, MVT::nxv1i16, 3},
687 {Intrinsic::bswap, MVT::nxv2i16, 3},
688 {Intrinsic::bswap, MVT::nxv4i16, 3},
689 {Intrinsic::bswap, MVT::nxv8i16, 3},
690 {Intrinsic::bswap, MVT::nxv16i16, 3},
691 {Intrinsic::bswap, MVT::v2i32, 12},
692 {Intrinsic::bswap, MVT::v4i32, 12},
693 {Intrinsic::bswap, MVT::v8i32, 12},
694 {Intrinsic::bswap, MVT::v16i32, 12},
695 {Intrinsic::bswap, MVT::nxv1i32, 12},
696 {Intrinsic::bswap, MVT::nxv2i32, 12},
697 {Intrinsic::bswap, MVT::nxv4i32, 12},
698 {Intrinsic::bswap, MVT::nxv8i32, 12},
699 {Intrinsic::bswap, MVT::nxv16i32, 12},
700 {Intrinsic::bswap, MVT::v2i64, 31},
701 {Intrinsic::bswap, MVT::v4i64, 31},
702 {Intrinsic::bswap, MVT::v8i64, 31},
703 {Intrinsic::bswap, MVT::v16i64, 31},
704 {Intrinsic::bswap, MVT::nxv1i64, 31},
705 {Intrinsic::bswap, MVT::nxv2i64, 31},
706 {Intrinsic::bswap, MVT::nxv4i64, 31},
707 {Intrinsic::bswap, MVT::nxv8i64, 31},
708 {Intrinsic::vp_bswap, MVT::v2i16, 3},
709 {Intrinsic::vp_bswap, MVT::v4i16, 3},
710 {Intrinsic::vp_bswap, MVT::v8i16, 3},
711 {Intrinsic::vp_bswap, MVT::v16i16, 3},
712 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
713 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
714 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
715 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
716 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
717 {Intrinsic::vp_bswap, MVT::v2i32, 12},
718 {Intrinsic::vp_bswap, MVT::v4i32, 12},
719 {Intrinsic::vp_bswap, MVT::v8i32, 12},
720 {Intrinsic::vp_bswap, MVT::v16i32, 12},
721 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
722 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
723 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
724 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
725 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
726 {Intrinsic::vp_bswap, MVT::v2i64, 31},
727 {Intrinsic::vp_bswap, MVT::v4i64, 31},
728 {Intrinsic::vp_bswap, MVT::v8i64, 31},
729 {Intrinsic::vp_bswap, MVT::v16i64, 31},
730 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
731 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
732 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
733 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
734 {Intrinsic::vp_fshl, MVT::v2i8, 7},
735 {Intrinsic::vp_fshl, MVT::v4i8, 7},
736 {Intrinsic::vp_fshl, MVT::v8i8, 7},
737 {Intrinsic::vp_fshl, MVT::v16i8, 7},
738 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
739 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
740 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
741 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
742 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
743 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
744 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
745 {Intrinsic::vp_fshl, MVT::v2i16, 7},
746 {Intrinsic::vp_fshl, MVT::v4i16, 7},
747 {Intrinsic::vp_fshl, MVT::v8i16, 7},
748 {Intrinsic::vp_fshl, MVT::v16i16, 7},
749 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
750 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
751 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
752 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
753 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
754 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
755 {Intrinsic::vp_fshl, MVT::v2i32, 7},
756 {Intrinsic::vp_fshl, MVT::v4i32, 7},
757 {Intrinsic::vp_fshl, MVT::v8i32, 7},
758 {Intrinsic::vp_fshl, MVT::v16i32, 7},
759 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
760 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
761 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
762 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
763 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
764 {Intrinsic::vp_fshl, MVT::v2i64, 7},
765 {Intrinsic::vp_fshl, MVT::v4i64, 7},
766 {Intrinsic::vp_fshl, MVT::v8i64, 7},
767 {Intrinsic::vp_fshl, MVT::v16i64, 7},
768 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
769 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
770 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
771 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
772 {Intrinsic::vp_fshr, MVT::v2i8, 7},
773 {Intrinsic::vp_fshr, MVT::v4i8, 7},
774 {Intrinsic::vp_fshr, MVT::v8i8, 7},
775 {Intrinsic::vp_fshr, MVT::v16i8, 7},
776 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
777 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
778 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
779 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
780 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
781 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
782 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
783 {Intrinsic::vp_fshr, MVT::v2i16, 7},
784 {Intrinsic::vp_fshr, MVT::v4i16, 7},
785 {Intrinsic::vp_fshr, MVT::v8i16, 7},
786 {Intrinsic::vp_fshr, MVT::v16i16, 7},
787 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
788 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
789 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
790 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
791 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
792 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
793 {Intrinsic::vp_fshr, MVT::v2i32, 7},
794 {Intrinsic::vp_fshr, MVT::v4i32, 7},
795 {Intrinsic::vp_fshr, MVT::v8i32, 7},
796 {Intrinsic::vp_fshr, MVT::v16i32, 7},
797 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
798 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
799 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
800 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
801 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
802 {Intrinsic::vp_fshr, MVT::v2i64, 7},
803 {Intrinsic::vp_fshr, MVT::v4i64, 7},
804 {Intrinsic::vp_fshr, MVT::v8i64, 7},
805 {Intrinsic::vp_fshr, MVT::v16i64, 7},
806 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
807 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
808 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
809 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
810 {Intrinsic::bitreverse, MVT::v2i8, 17},
811 {Intrinsic::bitreverse, MVT::v4i8, 17},
812 {Intrinsic::bitreverse, MVT::v8i8, 17},
813 {Intrinsic::bitreverse, MVT::v16i8, 17},
814 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
815 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
816 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
817 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
818 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
819 {Intrinsic::bitreverse, MVT::v2i16, 24},
820 {Intrinsic::bitreverse, MVT::v4i16, 24},
821 {Intrinsic::bitreverse, MVT::v8i16, 24},
822 {Intrinsic::bitreverse, MVT::v16i16, 24},
823 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
824 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
825 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
826 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
827 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
828 {Intrinsic::bitreverse, MVT::v2i32, 33},
829 {Intrinsic::bitreverse, MVT::v4i32, 33},
830 {Intrinsic::bitreverse, MVT::v8i32, 33},
831 {Intrinsic::bitreverse, MVT::v16i32, 33},
832 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
833 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
834 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
835 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
836 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
837 {Intrinsic::bitreverse, MVT::v2i64, 52},
838 {Intrinsic::bitreverse, MVT::v4i64, 52},
839 {Intrinsic::bitreverse, MVT::v8i64, 52},
840 {Intrinsic::bitreverse, MVT::v16i64, 52},
841 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
842 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
843 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
844 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
845 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
846 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
847 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
848 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
849 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
850 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
851 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
852 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
853 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
854 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
855 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
856 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
857 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
858 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
859 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
860 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
861 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
862 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
863 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
864 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
865 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
866 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
867 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
868 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
869 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
870 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
871 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
872 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
873 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
874 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
875 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
876 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
877 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
878 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
879 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
880 {Intrinsic::ctpop, MVT::v2i8, 12},
881 {Intrinsic::ctpop, MVT::v4i8, 12},
882 {Intrinsic::ctpop, MVT::v8i8, 12},
883 {Intrinsic::ctpop, MVT::v16i8, 12},
884 {Intrinsic::ctpop, MVT::nxv1i8, 12},
885 {Intrinsic::ctpop, MVT::nxv2i8, 12},
886 {Intrinsic::ctpop, MVT::nxv4i8, 12},
887 {Intrinsic::ctpop, MVT::nxv8i8, 12},
888 {Intrinsic::ctpop, MVT::nxv16i8, 12},
889 {Intrinsic::ctpop, MVT::v2i16, 19},
890 {Intrinsic::ctpop, MVT::v4i16, 19},
891 {Intrinsic::ctpop, MVT::v8i16, 19},
892 {Intrinsic::ctpop, MVT::v16i16, 19},
893 {Intrinsic::ctpop, MVT::nxv1i16, 19},
894 {Intrinsic::ctpop, MVT::nxv2i16, 19},
895 {Intrinsic::ctpop, MVT::nxv4i16, 19},
896 {Intrinsic::ctpop, MVT::nxv8i16, 19},
897 {Intrinsic::ctpop, MVT::nxv16i16, 19},
898 {Intrinsic::ctpop, MVT::v2i32, 20},
899 {Intrinsic::ctpop, MVT::v4i32, 20},
900 {Intrinsic::ctpop, MVT::v8i32, 20},
901 {Intrinsic::ctpop, MVT::v16i32, 20},
902 {Intrinsic::ctpop, MVT::nxv1i32, 20},
903 {Intrinsic::ctpop, MVT::nxv2i32, 20},
904 {Intrinsic::ctpop, MVT::nxv4i32, 20},
905 {Intrinsic::ctpop, MVT::nxv8i32, 20},
906 {Intrinsic::ctpop, MVT::nxv16i32, 20},
907 {Intrinsic::ctpop, MVT::v2i64, 21},
908 {Intrinsic::ctpop, MVT::v4i64, 21},
909 {Intrinsic::ctpop, MVT::v8i64, 21},
910 {Intrinsic::ctpop, MVT::v16i64, 21},
911 {Intrinsic::ctpop, MVT::nxv1i64, 21},
912 {Intrinsic::ctpop, MVT::nxv2i64, 21},
913 {Intrinsic::ctpop, MVT::nxv4i64, 21},
914 {Intrinsic::ctpop, MVT::nxv8i64, 21},
915 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
916 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
917 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
918 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
919 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
920 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
921 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
922 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
923 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
924 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
925 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
926 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
927 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
928 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
929 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
930 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
931 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
932 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
933 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
934 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
935 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
936 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
937 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
938 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
939 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
940 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
941 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
942 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
943 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
944 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
945 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
946 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
947 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
948 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
949 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
950 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
951 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
952 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
953 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
954 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
955 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
956 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
957 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
958 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
959 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
960 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
961 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
962 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
963 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
964 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
965 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
966 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
967 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
968 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
969 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
970 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
971 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
972 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
973 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
974 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
975 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
976 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
977 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
978 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
979 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
980 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
981 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
982 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
983 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
984 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
985 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
986 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
987 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
988 {Intrinsic::vp_cttz, MVT::v2i8, 16},
989 {Intrinsic::vp_cttz, MVT::v4i8, 16},
990 {Intrinsic::vp_cttz, MVT::v8i8, 16},
991 {Intrinsic::vp_cttz, MVT::v16i8, 16},
992 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
993 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
994 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
995 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
996 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
997 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
998 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
999 {Intrinsic::vp_cttz, MVT::v2i16, 23},
1000 {Intrinsic::vp_cttz, MVT::v4i16, 23},
1001 {Intrinsic::vp_cttz, MVT::v8i16, 23},
1002 {Intrinsic::vp_cttz, MVT::v16i16, 23},
1003 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
1004 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
1005 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
1006 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
1007 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
1008 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
1009 {Intrinsic::vp_cttz, MVT::v2i32, 24},
1010 {Intrinsic::vp_cttz, MVT::v4i32, 24},
1011 {Intrinsic::vp_cttz, MVT::v8i32, 24},
1012 {Intrinsic::vp_cttz, MVT::v16i32, 24},
1013 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
1014 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
1015 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
1016 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
1017 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
1018 {Intrinsic::vp_cttz, MVT::v2i64, 25},
1019 {Intrinsic::vp_cttz, MVT::v4i64, 25},
1020 {Intrinsic::vp_cttz, MVT::v8i64, 25},
1021 {Intrinsic::vp_cttz, MVT::v16i64, 25},
1022 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
1023 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
1024 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
1025 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
1026};
1027
1028static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
1029 switch (ID) {
1030#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
1031 case Intrinsic::VPID: \
1032 return ISD::VPSD;
1033#include "llvm/IR/VPIntrinsics.def"
1034#undef HELPER_MAP_VPID_TO_VPSD
1035 }
1036 return ISD::DELETED_NODE;
1037}
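// Illustrative note (annotation, not in the upstream source): the macro above
// expands the entries of llvm/IR/VPIntrinsics.def, so e.g. Intrinsic::vp_ceil
// is mapped to its VP SelectionDAG opcode, while any intrinsic without such a
// mapping falls through to ISD::DELETED_NODE.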
1038
1039InstructionCost
1040RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1041 TTI::TargetCostKind CostKind) {
1042 auto *RetTy = ICA.getReturnType();
1043 switch (ICA.getID()) {
1044 case Intrinsic::ceil:
1045 case Intrinsic::floor:
1046 case Intrinsic::trunc:
1047 case Intrinsic::rint:
1048 case Intrinsic::round:
1049 case Intrinsic::roundeven: {
1050 // These all use the same code.
1051 auto LT = getTypeLegalizationCost(RetTy);
1052 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1053 return LT.first * 8;
1054 break;
1055 }
1056 case Intrinsic::umin:
1057 case Intrinsic::umax:
1058 case Intrinsic::smin:
1059 case Intrinsic::smax: {
1060 auto LT = getTypeLegalizationCost(RetTy);
1061 if ((ST->hasVInstructions() && LT.second.isVector()) ||
1062 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
1063 return LT.first;
1064 break;
1065 }
1066 case Intrinsic::sadd_sat:
1067 case Intrinsic::ssub_sat:
1068 case Intrinsic::uadd_sat:
1069 case Intrinsic::usub_sat:
1070 case Intrinsic::fabs:
1071 case Intrinsic::sqrt: {
1072 auto LT = getTypeLegalizationCost(RetTy);
1073 if (ST->hasVInstructions() && LT.second.isVector())
1074 return LT.first;
1075 break;
1076 }
1077 case Intrinsic::abs: {
1078 auto LT = getTypeLegalizationCost(RetTy);
1079 if (ST->hasVInstructions() && LT.second.isVector()) {
1080 // vrsub.vi v10, v8, 0
1081 // vmax.vv v8, v8, v10
1082 return LT.first * 2;
1083 }
1084 break;
1085 }
1086 // TODO: add more intrinsics
1087 case Intrinsic::experimental_stepvector: {
1088 unsigned Cost = 1; // vid
1089 auto LT = getTypeLegalizationCost(RetTy);
1090 return Cost + (LT.first - 1);
1091 }
1092 case Intrinsic::vp_rint: {
1093 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1094 unsigned Cost = 5;
1095 auto LT = getTypeLegalizationCost(RetTy);
1096 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1097 return Cost * LT.first;
1098 break;
1099 }
1100 case Intrinsic::vp_nearbyint: {
1101 // One more read and one write of fflags than vp_rint.
1102 unsigned Cost = 7;
1103 auto LT = getTypeLegalizationCost(RetTy);
1104 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1105 return Cost * LT.first;
1106 break;
1107 }
1108 case Intrinsic::vp_ceil:
1109 case Intrinsic::vp_floor:
1110 case Intrinsic::vp_round:
1111 case Intrinsic::vp_roundeven:
1112 case Intrinsic::vp_roundtozero: {
1113 // Rounding with static rounding mode needs two more instructions to
1114 // swap/write FRM than vp_rint.
1115 unsigned Cost = 7;
1116 auto LT = getTypeLegalizationCost(RetTy);
1117 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1118 if (TLI->isOperationCustom(VPISD, LT.second))
1119 return Cost * LT.first;
1120 break;
1121 }
1122 }
1123
1124 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1125 auto LT = getTypeLegalizationCost(RetTy);
1126 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1127 ICA.getID(), LT.second))
1128 return LT.first * Entry->Cost;
1129 }
1130
1131 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1132}
1133
1134InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1135 Type *Src,
1136 TTI::CastContextHint CCH,
1137 TTI::TargetCostKind CostKind,
1138 const Instruction *I) {
1139 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
1140 // FIXME: Need to compute legalizing cost for illegal types.
1141 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
1142 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1143
1144 // Skip if element size of Dst or Src is bigger than ELEN.
1145 if (Src->getScalarSizeInBits() > ST->getELen() ||
1146 Dst->getScalarSizeInBits() > ST->getELen())
1147 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1148
1149 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1150 assert(ISD && "Invalid opcode");
1151
1152 // FIXME: Need to consider vsetvli and lmul.
1153 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1154 (int)Log2_32(Src->getScalarSizeInBits());
1155 switch (ISD) {
1156 case ISD::SIGN_EXTEND:
1157 case ISD::ZERO_EXTEND:
1158 if (Src->getScalarSizeInBits() == 1) {
1159 // We do not use vsext/vzext to extend from mask vector.
1160 // Instead we use the following instructions to extend from mask vector:
1161 // vmv.v.i v8, 0
1162 // vmerge.vim v8, v8, -1, v0
1163 return 2;
1164 }
1165 return 1;
1166 case ISD::TRUNCATE:
1167 if (Dst->getScalarSizeInBits() == 1) {
1168 // We do not use several vncvt instructions to truncate to a mask vector,
1169 // so we cannot use PowDiff to calculate the cost.
1170 // Instead we use the following instructions to truncate to mask vector:
1171 // vand.vi v8, v8, 1
1172 // vmsne.vi v0, v8, 0
1173 return 2;
1174 }
1175 [[fallthrough]];
1176 case ISD::FP_EXTEND:
1177 case ISD::FP_ROUND:
1178 // Counts of narrow/widen instructions.
1179 return std::abs(PowDiff);
1180 case ISD::FP_TO_SINT:
1181 case ISD::FP_TO_UINT:
1182 case ISD::SINT_TO_FP:
1183 case ISD::UINT_TO_FP:
1184 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1185 // The cost of converting from or to a mask vector is different from the
1186 // other cases, so we cannot use PowDiff to calculate it.
1187 // For mask vector to fp, we should use the following instructions:
1188 // vmv.v.i v8, 0
1189 // vmerge.vim v8, v8, -1, v0
1190 // vfcvt.f.x.v v8, v8
1191
1192 // And for fp vector to mask, we use:
1193 // vfncvt.rtz.x.f.w v9, v8
1194 // vand.vi v8, v9, 1
1195 // vmsne.vi v0, v8, 0
1196 return 3;
1197 }
1198 if (std::abs(PowDiff) <= 1)
1199 return 1;
1200 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1201 // so it only needs two conversions.
1202 if (Src->isIntOrIntVectorTy())
1203 return 2;
1204 // Counts of narrow/widen instructions.
1205 return std::abs(PowDiff);
1206 }
1207 }
1208 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1209}
1210
1211unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1212 if (isa<ScalableVectorType>(Ty)) {
1213 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1214 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1215 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1216 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1217 }
1218 return cast<FixedVectorType>(Ty)->getNumElements();
1219}
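// Illustrative example (annotation, not in the upstream source): assuming
// VLEN = 128 so that getVScaleForTuning() returns 2, a <vscale x 4 x i32>
// gives VectorBits = 2 * 64 = 128, EltSize = 32, MinSize = 128, and the
// estimated VL is computeVLMAX(128, 32, 128) = 8 lanes.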
1220
1221InstructionCost
1222RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1223 FastMathFlags FMF,
1224 TTI::TargetCostKind CostKind) {
1225 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1226 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1227
1228 // Skip if scalar size of Ty is bigger than ELEN.
1229 if (Ty->getScalarSizeInBits() > ST->getELen())
1230 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1231
1232 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1233 if (Ty->getElementType()->isIntegerTy(1))
1234 // vcpop sequences, see vreduction-mask.ll. umax, smin actually only
1235 // cost 2, but we don't have enough info here so we slightly over cost.
1236 return (LT.first - 1) + 3;
1237
1238 // An IR reduction is composed of two vmv and one rvv reduction instruction.
1239 InstructionCost BaseCost = 2;
1240
1241 if (CostKind == TTI::TCK_CodeSize)
1242 return (LT.first - 1) + BaseCost;
1243
1244 unsigned VL = getEstimatedVLFor(Ty);
1245 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1246}
1247
1248InstructionCost
1249RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1250 std::optional<FastMathFlags> FMF,
1251 TTI::TargetCostKind CostKind) {
1252 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1253 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1254
1255 // Skip if scalar size of Ty is bigger than ELEN.
1256 if (Ty->getScalarSizeInBits() > ST->getELen())
1257 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1258
1259 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1260 assert(ISD && "Invalid opcode");
1261
1262 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1263 ISD != ISD::FADD)
1264 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1265
1266 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1267 if (Ty->getElementType()->isIntegerTy(1))
1268 // vcpop sequences, see vreduction-mask.ll
1269 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1270
1271 // An IR reduction is composed of two vmv and one rvv reduction instruction.
1272 InstructionCost BaseCost = 2;
1273
1274 if (CostKind == TTI::TCK_CodeSize)
1275 return (LT.first - 1) + BaseCost;
1276
1277 unsigned VL = getEstimatedVLFor(Ty);
1278 if (TTI::requiresOrderedReduction(FMF))
1279 return (LT.first - 1) + BaseCost + VL;
1280 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1281}
1282
1283InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1284 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1285 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1286 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1287 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1288 FMF, CostKind);
1289
1290 // Skip if scalar size of ResTy is bigger than ELEN.
1291 if (ResTy->getScalarSizeInBits() > ST->getELen())
1292 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1293 FMF, CostKind);
1294
1295 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1296 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1297 FMF, CostKind);
1298
1299 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1300
1301 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1302 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1303 FMF, CostKind);
1304
1305 return (LT.first - 1) +
1306 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1307}
1308
1309InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1310 TTI::OperandValueInfo OpInfo,
1311 TTI::TargetCostKind CostKind) {
1312 assert(OpInfo.isConstant() && "non constant operand?");
1313 if (!isa<VectorType>(Ty))
1314 // FIXME: We need to account for immediate materialization here, but doing
1315 // a decent job requires more knowledge about the immediate than we
1316 // currently have here.
1317 return 0;
1318
1319 if (OpInfo.isUniform())
1320 // vmv.x.i, vmv.v.x, or vfmv.v.f
1321 // We ignore the cost of the scalar constant materialization to be consistent
1322 // with how we treat scalar constants themselves just above.
1323 return 1;
1324
1325 return getConstantPoolLoadCost(Ty, CostKind);
1326}
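// Illustrative note (annotation, not in the upstream source): storing a
// uniform constant such as a splat of 7 is charged 1 (a single
// vmv.v.i/vmv.v.x splat), whereas a non-uniform constant vector is charged as
// the constant-pool load produced by getConstantPoolLoadCost above.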
1327
1328
1329InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1330 MaybeAlign Alignment,
1331 unsigned AddressSpace,
1332 TTI::TargetCostKind CostKind,
1333 TTI::OperandValueInfo OpInfo,
1334 const Instruction *I) {
1335 EVT VT = TLI->getValueType(DL, Src, true);
1336 // Type legalization can't handle structs
1337 if (VT == MVT::Other)
1338 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1339 CostKind, OpInfo, I);
1340
1341 InstructionCost Cost = 0;
1342 if (Opcode == Instruction::Store && OpInfo.isConstant())
1343 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1344 InstructionCost BaseCost =
1345 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1346 CostKind, OpInfo, I);
1347 // Assume memory op costs scale with the number of vector registers
1348 // possibly accessed by the instruction. Note that BasicTTI already
1349 // handles the LT.first term for us.
1350 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1351 LT.second.isVector())
1352 BaseCost *= TLI->getLMULCost(LT.second);
1353 return Cost + BaseCost;
1354
1355}
1356
1357InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1358 Type *CondTy,
1359 CmpInst::Predicate VecPred,
1360 TTI::TargetCostKind CostKind,
1361 const Instruction *I) {
1362 if (CostKind != TTI::TCK_RecipThroughput)
1363 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1364 I);
1365
1366 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1367 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1368 I);
1369
1370 // Skip if scalar size of ValTy is bigger than ELEN.
1371 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1372 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1373 I);
1374
1375 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1376 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1377 if (CondTy->isVectorTy()) {
1378 if (ValTy->getScalarSizeInBits() == 1) {
1379 // vmandn.mm v8, v8, v9
1380 // vmand.mm v9, v0, v9
1381 // vmor.mm v0, v9, v8
1382 return LT.first * 3;
1383 }
1384 // vselect and max/min are supported natively.
1385 return LT.first * 1;
1386 }
1387
1388 if (ValTy->getScalarSizeInBits() == 1) {
1389 // vmv.v.x v9, a0
1390 // vmsne.vi v9, v9, 0
1391 // vmandn.mm v8, v8, v9
1392 // vmand.mm v9, v0, v9
1393 // vmor.mm v0, v9, v8
1394 return LT.first * 5;
1395 }
1396
1397 // vmv.v.x v10, a0
1398 // vmsne.vi v0, v10, 0
1399 // vmerge.vvm v8, v9, v8, v0
1400 return LT.first * 3;
1401 }
1402
1403 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1404 ValTy->isVectorTy()) {
1405 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1406
1407 // Support natively.
1408 if (CmpInst::isIntPredicate(VecPred))
1409 return LT.first * 1;
1410
1411 // If we do not support the input floating point vector type, use the base
1412 // one which will calculate as:
1413 // ScalarizeCost + Num * Cost for fixed vector,
1414 // InvalidCost for scalable vector.
1415 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1416 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1417 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1418 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1419 I);
1420 switch (VecPred) {
1421 // Support natively.
1422 case CmpInst::FCMP_OEQ:
1423 case CmpInst::FCMP_OGT:
1424 case CmpInst::FCMP_OGE:
1425 case CmpInst::FCMP_OLT:
1426 case CmpInst::FCMP_OLE:
1427 case CmpInst::FCMP_UNE:
1428 return LT.first * 1;
1429 // TODO: Other comparisons?
1430 default:
1431 break;
1432 }
1433 }
1434
1435 // TODO: Add cost for scalar type.
1436
1437 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1438}
1439
1440InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1441 TTI::TargetCostKind CostKind,
1442 unsigned Index, Value *Op0,
1443 Value *Op1) {
1444 assert(Val->isVectorTy() && "This must be a vector type");
1445
1446 if (Opcode != Instruction::ExtractElement &&
1447 Opcode != Instruction::InsertElement)
1448 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1449
1450 // Legalize the type.
1451 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1452
1453 // This type is legalized to a scalar type.
1454 if (!LT.second.isVector())
1455 return 0;
1456
1457 // For unsupported scalable vector.
1458 if (LT.second.isScalableVector() && !LT.first.isValid())
1459 return LT.first;
1460
1461 if (!isTypeLegal(Val))
1462 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1463
1464 // Mask vector extract/insert is expanded via e8.
1465 if (Val->getScalarSizeInBits() == 1) {
1466 VectorType *WideTy =
1467 VectorType::get(IntegerType::get(Val->getContext(), 8),
1468 cast<VectorType>(Val)->getElementCount());
1469 if (Opcode == Instruction::ExtractElement) {
1470 InstructionCost ExtendCost
1471 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1472 TTI::CastContextHint::None, CostKind);
1473 InstructionCost ExtractCost
1474 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1475 return ExtendCost + ExtractCost;
1476 }
1477 InstructionCost ExtendCost
1478 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1479 TTI::CastContextHint::None, CostKind);
1480 InstructionCost InsertCost
1481 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1482 InstructionCost TruncCost
1483 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1484 TTI::CastContextHint::None, CostKind);
1485 return ExtendCost + InsertCost + TruncCost;
1486 }
1487
1488
1489 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1490 // and vslideup + vmv.s.x to insert element to vector.
1491 unsigned BaseCost = 1;
1493 // For insertelement, the input index of vslideup is the element index plus
1494 // 1, which needs an extra addi.
1493 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1494
1495 if (Index != -1U) {
1496 // The type may be split. For fixed-width vectors we can normalize the
1497 // index to the new type.
1498 if (LT.second.isFixedLengthVector()) {
1499 unsigned Width = LT.second.getVectorNumElements();
1500 Index = Index % Width;
1501 }
1502
1503 // We could extract/insert the first element without vslidedown/vslideup.
1504 if (Index == 0)
1505 SlideCost = 0;
1506 else if (Opcode == Instruction::InsertElement)
1507 SlideCost = 1; // With a constant index, we do not need to use addi.
1508 }
1509
1510 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1511 if (Val->getScalarType()->isIntegerTy() &&
1512 ST->getXLen() < Val->getScalarSizeInBits()) {
1513 // For extractelement, we need the following instructions:
1514 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1515 // vslidedown.vx v8, v8, a0
1516 // vmv.x.s a0, v8
1517 // li a1, 32
1518 // vsrl.vx v8, v8, a1
1519 // vmv.x.s a1, v8
1520
1521 // For insertelement, we need the following instructions:
1522 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1523 // vmv.v.i v12, 0
1524 // vslide1up.vx v16, v12, a1
1525 // vslide1up.vx v12, v16, a0
1526 // addi a0, a2, 1
1527 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1528 // vslideup.vx v8, v12, a2
1529
1530 // TODO: should we count these special vsetvlis?
1531 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1532 }
1533 return BaseCost + SlideCost;
1534}
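// Illustrative note (annotation, not in the upstream source): with the model
// above, extracting element 0 of a legal vector costs 1 (just vmv.x.s), while
// inserting at an unknown index costs 1 + 2 (the vslideup plus the addi that
// forms index + 1).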
1535
1536InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1537 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1538 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1539 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1540
1541 // TODO: Handle more cost kinds.
1542 if (CostKind != TTI::TCK_RecipThroughput)
1543 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1544 Args, CxtI);
1545
1546 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1547 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1548 Args, CxtI);
1549
1550 // Skip if scalar size of Ty is bigger than ELEN.
1551 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1552 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1553 Args, CxtI);
1554
1555 // Legalize the type.
1556 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1557
1558 // TODO: Handle scalar type.
1559 if (!LT.second.isVector())
1560 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1561 Args, CxtI);
1562
1563
1564 auto getConstantMatCost =
1565 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1566 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1567 // Two sub-cases:
1568 // * Has a 5-bit immediate operand which can be splatted.
1569 // * Has a larger immediate which must be materialized in a scalar register.
1570 // We return 0 for both as we currently ignore the cost of materializing
1571 // scalar constants in GPRs.
1572 return 0;
1573
1574 return getConstantPoolLoadCost(Ty, CostKind);
1575 };
1576
1577 // Add the cost of materializing any constant vectors required.
1578 InstructionCost ConstantMatCost = 0;
1579 if (Op1Info.isConstant())
1580 ConstantMatCost += getConstantMatCost(0, Op1Info);
1581 if (Op2Info.isConstant())
1582 ConstantMatCost += getConstantMatCost(1, Op2Info);
1583
1584 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1585 case ISD::ADD:
1586 case ISD::SUB:
1587 case ISD::AND:
1588 case ISD::OR:
1589 case ISD::XOR:
1590 case ISD::SHL:
1591 case ISD::SRL:
1592 case ISD::SRA:
1593 case ISD::MUL:
1594 case ISD::MULHS:
1595 case ISD::MULHU:
1596 case ISD::FADD:
1597 case ISD::FSUB:
1598 case ISD::FMUL:
1599 case ISD::FNEG: {
1600 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1601 }
1602 default:
1603 return ConstantMatCost +
1604 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1605 Args, CxtI);
1606 }
1607}
1608
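// Illustrative sketch (not part of this file): for the simple integer/FP
// opcodes handled above, the result is the constant-materialization cost plus
// one LMUL-scaled vector op per legalized part. The helper name and plain
// integer types are assumptions of this sketch.
static long sketchArithmeticCost(long ConstantMatCost, long LMULCost,
                                 long NumLegalizedParts) {
  // Mirrors: ConstantMatCost + getLMULCost(LT.second) * LT.first * 1
  return ConstantMatCost + LMULCost * NumLegalizedParts;
}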
1609// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1610InstructionCost RISCVTTIImpl::getPointersChainCost(
1611 ArrayRef<const Value *> Ptrs, const Value *Base,
1612 const TTI::PointersChainInfo &Info, Type *AccessTy,
1613 TTI::TargetCostKind CostKind) {
1614 InstructionCost Cost = TTI::TCC_Free;
1615 // In the basic model we only take GEP instructions into account
1616 // (although the chain may also contain alloca instructions, values, constants
1617 // and/or constant expressions, PHIs, bitcasts ... anything allowed to be used
1618 // as a pointer). Typically, if Base is not a GEP instruction and all the
1619 // pointers are relative to the same base address, all the rest are
1620 // either GEP instructions, PHIs, bitcasts or constants. When we have the same
1621 // base, we just calculate the cost of each non-Base GEP as an ADD operation
1622 // if any of its indices is a non-constant.
1623 // If there are no known dependencies between the pointers, the cost is
1624 // calculated as a sum of the costs of the GEP instructions.
1625 for (auto [I, V] : enumerate(Ptrs)) {
1626 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1627 if (!GEP)
1628 continue;
1629 if (Info.isSameBase() && V != Base) {
1630 if (GEP->hasAllConstantIndices())
1631 continue;
1632 // If the chain is unit-stride and BaseReg + stride*i is a legal
1633 // addressing mode, then presume the base GEP is sitting around in a
1634 // register somewhere and check if we can fold the offset relative to
1635 // it.
1636 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1637 if (Info.isUnitStride() &&
1638 isLegalAddressingMode(AccessTy,
1639 /* BaseGV */ nullptr,
1640 /* BaseOffset */ Stride * I,
1641 /* HasBaseReg */ true,
1642 /* Scale */ 0,
1643 GEP->getType()->getPointerAddressSpace()))
1644 continue;
1645 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1646 {TTI::OK_AnyValue, TTI::OP_None},
1647 {TTI::OK_AnyValue, TTI::OP_None},
1648 std::nullopt);
1649 } else {
1650 SmallVector<const Value *> Indices(GEP->indices());
1651 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1652 Indices, AccessTy, CostKind);
1653 }
1654 }
1655 return Cost;
1656}
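// Illustrative sketch (not part of this file): for a same-base, unit-stride
// pointer chain, the i-th GEP is free when BaseReg + Stride*i is a legal
// addressing mode, and otherwise costs one ADD. The helper condenses that
// decision; its name and the simm12 check standing in for
// isLegalAddressingMode are assumptions of this sketch.
static bool sketchChainGEPIsFree(unsigned AccessSizeBytes, unsigned Index) {
  long Offset = static_cast<long>(AccessSizeBytes) * Index;
  // RISC-V base+offset addressing takes a 12-bit signed immediate.
  return Offset >= -2048 && Offset <= 2047;
}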
1657
1658void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1659 TTI::UnrollingPreferences &UP,
1660 OptimizationRemarkEmitter *ORE) {
1661 // TODO: More tuning on benchmarks and metrics; changes as needed would
1662 // apply to all the settings below to improve performance.
1663
1664
1665 if (ST->enableDefaultUnroll())
1666 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1667
1668 // Enable upper-bound unrolling universally, not dependent upon the conditions
1669 // below.
1670 UP.UpperBound = true;
1671
1672 // Disable loop unrolling for Oz and Os.
1673 UP.OptSizeThreshold = 0;
1674 UP.PartialOptSizeThreshold = 0;
1675 if (L->getHeader()->getParent()->hasOptSize())
1676 return;
1677
1678 SmallVector<BasicBlock *, 4> ExitingBlocks;
1679 L->getExitingBlocks(ExitingBlocks);
1680 LLVM_DEBUG(dbgs() << "Loop has:\n"
1681 << "Blocks: " << L->getNumBlocks() << "\n"
1682 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1683
1684 // Allow at most one exit other than the latch. This acts as an early exit,
1685 // as it mirrors the profitability calculation of the runtime unroller.
1686 if (ExitingBlocks.size() > 2)
1687 return;
1688
1689 // Limit the CFG of the loop body for targets with a branch predictor.
1690 // Allowing 4 blocks permits if-then-else diamonds in the body.
1691 if (L->getNumBlocks() > 4)
1692 return;
1693
1694 // Don't unroll vectorized loops, including the remainder loop
1695 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1696 return;
1697
1698 // Scan the loop: don't unroll loops with calls as this could prevent
1699 // inlining.
1700 InstructionCost Cost = 0;
1701 for (auto *BB : L->getBlocks()) {
1702 for (auto &I : *BB) {
1703 // Initial setting - Don't unroll loops containing vectorized
1704 // instructions.
1705 if (I.getType()->isVectorTy())
1706 return;
1707
1708 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1709 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1710 if (!isLoweredToCall(F))
1711 continue;
1712 }
1713 return;
1714 }
1715
1716 SmallVector<const Value *> Operands(I.operand_values());
1717 Cost += getInstructionCost(&I, Operands,
1718 TargetTransformInfo::TCK_SizeAndLatency);
1719 }
1720 }
1721
1722 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1723
1724 UP.Partial = true;
1725 UP.Runtime = true;
1726 UP.UnrollRemainder = true;
1727 UP.UnrollAndJam = true;
1728 UP.UnrollAndJamInnerLoopThreshold = 60;
1729
1730 // Forcing the unrolling of small loops can be very useful because of the
1731 // branch-taken cost of the backedge.
1732 if (Cost < 12)
1733 UP.Force = true;
1734}
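// Illustrative sketch (not part of this file): the early-return filter above
// only considers unrolling loops with a simple shape. The helper condenses
// those checks; its name and parameters are assumptions of this sketch.
static bool sketchShouldConsiderUnrolling(unsigned NumExitingBlocks,
                                          unsigned NumBlocks,
                                          bool IsVectorized,
                                          bool HasOpaqueCall) {
  return NumExitingBlocks <= 2 && // at most one exit besides the latch
         NumBlocks <= 4 &&        // allows an if-then-else diamond in the body
         !IsVectorized &&         // leave vectorized loops and remainders alone
         !HasOpaqueCall;          // calls that stay calls would block inlining
}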
1735
1736void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1737 TTI::PeelingPreferences &PP) {
1738 BaseT::getPeelingPreferences(L, SE, PP);
1739}
1740
1741unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1742 TypeSize Size = DL.getTypeSizeInBits(Ty);
1743 if (Ty->isVectorTy()) {
1744 if (Size.isScalable() && ST->hasVInstructions())
1745 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1746
1747 if (ST->useRVVForFixedLengthVectors())
1748 return divideCeil(Size, ST->getRealMinVLen());
1749 }
1750
1751 return BaseT::getRegUsageForType(Ty);
1752}
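// Illustrative sketch (not part of this file): register usage for a vector
// type is its size divided, rounding up, by the register granularity
// (RVVBitsPerBlock, which is 64, for scalable types; the minimum VLEN for
// fixed-width ones). The helper name and parameters are assumptions of this
// sketch.
static unsigned sketchRegUsage(unsigned long SizeInBits, bool Scalable,
                               unsigned RealMinVLen) {
  const unsigned long Granularity = Scalable ? 64 : RealMinVLen;
  return static_cast<unsigned>((SizeInBits + Granularity - 1) / Granularity);
}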
1753
1754unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1755 if (SLPMaxVF.getNumOccurrences())
1756 return SLPMaxVF;
1757
1758 // Return how many elements can fit in getRegisterBitWidth. This is the
1759 // same routine as used in the LoopVectorizer. We should probably be
1760 // accounting for whether we actually have instructions with the right
1761 // lane type, but we don't have enough information to do that without
1762 // some additional plumbing which hasn't been justified yet.
1763 TypeSize RegWidth =
1764 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1765 // If no vector registers, or absurd element widths, disable
1766 // vectorization by returning 1.
1767 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1768}
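// Illustrative sketch (not part of this file): the SLP maximum VF is simply
// how many elements of the requested width fit in one vector register,
// clamped to at least 1. The helper name is an assumption of this sketch.
#include <algorithm>
static unsigned sketchMaximumVF(unsigned RegWidthInBits, unsigned ElemWidth) {
  return std::max(1u, RegWidthInBits / ElemWidth);
}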
1769
1770bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1771 const TargetTransformInfo::LSRCost &C2) {
1772 // The RISC-V-specific part here is that instruction count has first priority.
1773 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1774 C1.NumIVMuls, C1.NumBaseAdds,
1775 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1776 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1777 C2.NumIVMuls, C2.NumBaseAdds,
1778 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1779}
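// Illustrative sketch (not part of this file): the comparison above is a
// plain lexicographic compare with instruction count given first priority,
// which std::tie expresses directly. The struct and the field subset are
// assumptions of this sketch.
#include <tuple>
struct SketchLSRCost {
  unsigned Insns, NumRegs, SetupCost;
};
static bool sketchIsLSRCostLess(const SketchLSRCost &A,
                                const SketchLSRCost &B) {
  return std::tie(A.Insns, A.NumRegs, A.SetupCost) <
         std::tie(B.Insns, B.NumRegs, B.SetupCost);
}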