1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
36
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
40 // Check if the type is valid for all CostKind
41 if (!VT.isVector())
43 size_t NumInstr = OpCodes.size();
45 return NumInstr;
46 InstructionCost LMULCost = TLI->getLMULCost(VT);
48 return LMULCost * NumInstr;
50 for (auto Op : OpCodes) {
51 switch (Op) {
52 case RISCV::VRGATHER_VI:
53 Cost += TLI->getVRGatherVICost(VT);
54 break;
55 case RISCV::VRGATHER_VV:
56 Cost += TLI->getVRGatherVVCost(VT);
57 break;
58 case RISCV::VSLIDEUP_VI:
59 case RISCV::VSLIDEDOWN_VI:
60 Cost += TLI->getVSlideVICost(VT);
61 break;
62 case RISCV::VSLIDEUP_VX:
63 case RISCV::VSLIDEDOWN_VX:
64 Cost += TLI->getVSlideVXCost(VT);
65 break;
66 case RISCV::VREDMAX_VS:
67 case RISCV::VREDMIN_VS:
68 case RISCV::VREDMAXU_VS:
69 case RISCV::VREDMINU_VS:
70 case RISCV::VREDSUM_VS:
71 case RISCV::VREDAND_VS:
72 case RISCV::VREDOR_VS:
73 case RISCV::VREDXOR_VS:
74 case RISCV::VFREDMAX_VS:
75 case RISCV::VFREDMIN_VS:
76 case RISCV::VFREDUSUM_VS: {
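      // The cost below models an unordered reduction as a log2(VL)-deep
      // reduction tree; e.g. (illustrative) an estimated VL of 8 contributes
      // log2(8) == 3.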
77 unsigned VL = VT.getVectorMinNumElements();
78 if (!VT.isFixedLengthVector())
79 VL *= *getVScaleForTuning();
80 Cost += Log2_32_Ceil(VL);
81 break;
82 }
83 case RISCV::VFREDOSUM_VS: {
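      // The ordered FP reduction is inherently serial, so its cost below is
      // linear in the estimated VL.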
84 unsigned VL = VT.getVectorMinNumElements();
85 if (!VT.isFixedLengthVector())
86 VL *= *getVScaleForTuning();
87 Cost += VL;
88 break;
89 }
90 case RISCV::VMV_X_S:
91 case RISCV::VMV_S_X:
92 case RISCV::VFMV_F_S:
93 case RISCV::VFMV_S_F:
94 case RISCV::VMOR_MM:
95 case RISCV::VMXOR_MM:
96 case RISCV::VMAND_MM:
97 case RISCV::VMANDN_MM:
98 case RISCV::VMNAND_MM:
99 case RISCV::VCPOP_M:
100 Cost += 1;
101 break;
102 default:
103 Cost += LMULCost;
104 }
105 }
106 return Cost;
107}
108
111 assert(Ty->isIntegerTy() &&
112 "getIntImmCost can only estimate cost of materialising integers");
113
114 // We have a Zero register, so 0 is always free.
115 if (Imm == 0)
116 return TTI::TCC_Free;
117
118 // Otherwise, we check how many instructions it will take to materialise.
119 const DataLayout &DL = getDataLayout();
120 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
121}
122
123// Look for patterns of shift followed by AND that can be turned into a pair of
124// shifts. We won't need to materialize an immediate for the AND so these can
125// be considered free.
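// For example (illustrative, XLEN=64):
//   %s = shl i64 %x, 8
//   %a = and i64 %s, 65280   ; 0xFF00 is a shifted mask with 8 trailing zeros
// can be lowered as srli (slli %x, 56), 48, so the AND immediate is free.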
126static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
127 uint64_t Mask = Imm.getZExtValue();
128 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
129 if (!BO || !BO->hasOneUse())
130 return false;
131
132 if (BO->getOpcode() != Instruction::Shl)
133 return false;
134
135 if (!isa<ConstantInt>(BO->getOperand(1)))
136 return false;
137
138 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
139 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
140 // is a mask shifted by c2 bits with c3 leading zeros.
141 if (isShiftedMask_64(Mask)) {
142 unsigned Trailing = llvm::countr_zero(Mask);
143 if (ShAmt == Trailing)
144 return true;
145 }
146
147 return false;
148}
149
151 const APInt &Imm, Type *Ty,
153 Instruction *Inst) {
154 assert(Ty->isIntegerTy() &&
155 "getIntImmCost can only estimate cost of materialising integers");
156
157 // We have a Zero register, so 0 is always free.
158 if (Imm == 0)
159 return TTI::TCC_Free;
160
161 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
162 // commutative, in others the immediate comes from a specific argument index.
163 bool Takes12BitImm = false;
164 unsigned ImmArgIdx = ~0U;
165
166 switch (Opcode) {
167 case Instruction::GetElementPtr:
168 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
169 // split up large offsets in GEP into better parts than ConstantHoisting
170 // can.
171 return TTI::TCC_Free;
172 case Instruction::Store:
173 // If the address is a constant, use the materialization cost.
174 if (Idx == 1)
175 return getIntImmCost(Imm, Ty, CostKind);
176 return TTI::TCC_Free;
177 case Instruction::Load:
178 // If the address is a constant, use the materialization cost.
179 return getIntImmCost(Imm, Ty, CostKind);
180 case Instruction::And:
181 // zext.h
182 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
183 return TTI::TCC_Free;
184 // zext.w
185 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
186 return TTI::TCC_Free;
187 // bclri
188 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
189 return TTI::TCC_Free;
190 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
191 canUseShiftPair(Inst, Imm))
192 return TTI::TCC_Free;
193 Takes12BitImm = true;
194 break;
195 case Instruction::Add:
196 Takes12BitImm = true;
197 break;
198 case Instruction::Or:
199 case Instruction::Xor:
200 // bseti/binvi
201 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
202 return TTI::TCC_Free;
203 Takes12BitImm = true;
204 break;
205 case Instruction::Mul:
206 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
207 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
208 return TTI::TCC_Free;
209 // One more or less than a power of 2 can use SLLI+ADD/SUB.
210 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
211 return TTI::TCC_Free;
212 // FIXME: There is no MULI instruction.
213 Takes12BitImm = true;
214 break;
215 case Instruction::Sub:
216 case Instruction::Shl:
217 case Instruction::LShr:
218 case Instruction::AShr:
219 Takes12BitImm = true;
220 ImmArgIdx = 1;
221 break;
222 default:
223 break;
224 }
225
226 if (Takes12BitImm) {
227 // Check immediate is the correct argument...
228 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
229 // ... and fits into the 12-bit immediate.
230 if (Imm.getSignificantBits() <= 64 &&
231 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
232 return TTI::TCC_Free;
233 }
234 }
235
236 // Otherwise, use the full materialisation cost.
237 return getIntImmCost(Imm, Ty, CostKind);
238 }
239
240 // By default, prevent hoisting.
241 return TTI::TCC_Free;
242}
243
246 const APInt &Imm, Type *Ty,
248 // Prevent hoisting in unknown cases.
249 return TTI::TCC_Free;
250}
251
252bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
253 return ST->hasVInstructions();
254}
255
258 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
259 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
262}
263
265 // Currently, the ExpandReductions pass can't expand scalable-vector
266 // reductions, but we still request expansion as RVV doesn't support certain
267 // reductions and the SelectionDAG can't legalize them either.
268 switch (II->getIntrinsicID()) {
269 default:
270 return false;
271 // These reductions have no equivalent in RVV
272 case Intrinsic::vector_reduce_mul:
273 case Intrinsic::vector_reduce_fmul:
274 return true;
275 }
276}
277
278std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
279 if (ST->hasVInstructions())
281 return BaseT::getMaxVScale();
282}
283
284std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
285 if (ST->hasVInstructions())
286 if (unsigned MinVLen = ST->getRealMinVLen();
287 MinVLen >= RISCV::RVVBitsPerBlock)
288 return MinVLen / RISCV::RVVBitsPerBlock;
290}
291
294 unsigned LMUL =
295 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
296 switch (K) {
298 return TypeSize::getFixed(ST->getXLen());
300 return TypeSize::getFixed(
301 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
304 (ST->hasVInstructions() &&
307 : 0);
308 }
309
310 llvm_unreachable("Unsupported register kind");
311}
312
314RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
315 // Add a cost of address generation + the cost of the load. The address
316 // is expected to be a PC relative offset to a constant pool entry
317 // using auipc/addi.
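 // Example sequence (illustrative):
 // .Lpcrel_hi0:
 //   auipc a0, %pcrel_hi(.LCPI0_0)
 //   addi  a0, a0, %pcrel_lo(.Lpcrel_hi0)
 //   <memory op using (a0)>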
318 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
319 /*AddressSpace=*/0, CostKind);
320}
321
323 LLVMContext &C) {
324 assert((DataVT.getScalarSizeInBits() != 8 ||
325 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
326 MVT IndexVT = DataVT.changeTypeToInteger();
327 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
328 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
329 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
330}
331
333 VectorType *Tp, ArrayRef<int> Mask,
335 int Index, VectorType *SubTp,
337 const Instruction *CxtI) {
338 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
339
340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
341
342 // First, handle cases where having a fixed length vector enables us to
343 // give a more accurate cost than falling back to generic scalable codegen.
344 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
345 if (isa<FixedVectorType>(Tp)) {
346 switch (Kind) {
347 default:
348 break;
350 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
351 MVT EltTp = LT.second.getVectorElementType();
352 // If the size of the element is < ELEN then shuffles of interleaves and
353 // deinterleaves of 2 vectors can be lowered into the following
354 // sequences
355 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
356 // Example sequence:
357 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
358 // vwaddu.vv v10, v8, v9
359 // li a0, -1 (ignored)
360 // vwmaccu.vx v10, a0, v9
361 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
362 return 2 * LT.first * TLI->getLMULCost(LT.second);
363
364 if (Mask[0] == 0 || Mask[0] == 1) {
365 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
366 // Example sequence:
367 // vnsrl.wi v10, v8, 0
368 if (equal(DeinterleaveMask, Mask))
369 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
370 LT.second, CostKind);
371 }
372 }
373 }
374 // vrgather + cost of generating the mask constant.
375 // We model this for an unknown mask with a single vrgather.
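 // Example sequence (illustrative):
 //   vle8.v      v12, (a0)   ; index vector loaded from the constant pool
 //   vrgather.vv v10, v8, v12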
376 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
377 (LT.second.getScalarSizeInBits() != 8 ||
378 LT.second.getVectorNumElements() <= 256)) {
379 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
380 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
381 return IndexCost +
382 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
383 }
384 [[fallthrough]];
385 }
388 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
389 // register for the second vrgather. We model this for an unknown
390 // (shuffle) mask.
391 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
392 (LT.second.getScalarSizeInBits() != 8 ||
393 LT.second.getVectorNumElements() <= 256)) {
394 auto &C = Tp->getContext();
395 auto EC = Tp->getElementCount();
396 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
398 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
399 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
400 return 2 * IndexCost +
401 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
402 LT.second, CostKind) +
403 MaskCost;
404 }
405 [[fallthrough]];
406 }
407 case TTI::SK_Select: {
408 // We are going to permute multiple sources and the result will be in
409 // multiple destinations. We provide an accurate cost only for splits where
410 // the element type remains the same.
411 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
412 LT.second.isFixedLengthVector() &&
413 LT.second.getVectorElementType().getSizeInBits() ==
415 LT.second.getVectorNumElements() <
416 cast<FixedVectorType>(Tp)->getNumElements() &&
417 divideCeil(Mask.size(),
418 cast<FixedVectorType>(Tp)->getNumElements()) ==
419 static_cast<unsigned>(*LT.first.getValue())) {
420 unsigned NumRegs = *LT.first.getValue();
421 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
422 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
423 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
424
426 for (unsigned I = 0; I < NumRegs; ++I) {
427 bool IsSingleVector = true;
428 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
429 transform(Mask.slice(I * SubVF,
430 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
431 SubMask.begin(), [&](int I) {
432 bool SingleSubVector = I / VF == 0;
433 IsSingleVector &= SingleSubVector;
434 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
435 });
438 SubVecTy, SubMask, CostKind, 0, nullptr);
439 return Cost;
440 }
441 }
442 break;
443 }
444 }
445 };
446
447 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
448 switch (Kind) {
449 default:
450 // Fallthrough to generic handling.
451 // TODO: Most of these cases will return getInvalid in generic code, and
452 // must be implemented here.
453 break;
455 // Extract at zero is always a subregister extract
456 if (Index == 0)
457 return TTI::TCC_Free;
458
459 // If we're extracting a subvector of at most m1 size at a sub-register
460 // boundary - which unfortunately we need exact vlen to identify - this is
461 // a subregister extract at worst and thus won't require a vslidedown.
462 // TODO: Extend for aligned m2, m4 subvector extracts
463 // TODO: Extend for misaligned (but contained) extracts
464 // TODO: Extend for scalable subvector types
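 // For example (illustrative), with an exact VLEN of 128 extracting a
 // <2 x i64> subvector at index 2 lands on an m1 register boundary and is
 // treated as free below.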
465 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
466 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
467 const unsigned MinVLen = ST->getRealMinVLen();
468 const unsigned MaxVLen = ST->getRealMaxVLen();
469 if (MinVLen == MaxVLen &&
470 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
471 SubLT.second.getSizeInBits() <= MinVLen)
472 return TTI::TCC_Free;
473 }
474
475 // Example sequence:
476 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
477 // vslidedown.vi v8, v9, 2
478 return LT.first *
479 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
481 // Example sequence:
482 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
483 // vslideup.vi v8, v9, 2
484 return LT.first *
485 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
486 case TTI::SK_Select: {
487 // Example sequence:
488 // li a0, 90
489 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
490 // vmv.s.x v0, a0
491 // vmerge.vvm v8, v9, v8, v0
492 // We use 2 for the cost of the mask materialization as this is the true
493 // cost for small masks and most shuffles are small. At worst, this cost
494 // should be a very small constant for the constant pool load. As such,
495 // we may bias towards large selects slightly more than truly warranted.
496 return LT.first *
497 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
498 LT.second, CostKind));
499 }
500 case TTI::SK_Broadcast: {
501 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
502 Instruction::InsertElement);
503 if (LT.second.getScalarSizeInBits() == 1) {
504 if (HasScalar) {
505 // Example sequence:
506 // andi a0, a0, 1
507 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
508 // vmv.v.x v8, a0
509 // vmsne.vi v0, v8, 0
510 return LT.first *
511 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
512 LT.second, CostKind));
513 }
514 // Example sequence:
515 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
516 // vmv.v.i v8, 0
517 // vmerge.vim v8, v8, 1, v0
518 // vmv.x.s a0, v8
519 // andi a0, a0, 1
520 // vmv.v.x v8, a0
521 // vmsne.vi v0, v8, 0
522
523 return LT.first *
524 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
525 RISCV::VMV_X_S, RISCV::VMV_V_X,
526 RISCV::VMSNE_VI},
527 LT.second, CostKind));
528 }
529
530 if (HasScalar) {
531 // Example sequence:
532 // vmv.v.x v8, a0
533 return LT.first *
534 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
535 }
536
537 // Example sequence:
538 // vrgather.vi v9, v8, 0
539 return LT.first *
540 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
541 }
542 case TTI::SK_Splice: {
543 // vslidedown+vslideup.
544 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
545 // of similar code, but I think we expand through memory.
546 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
547 if (Index >= 0 && Index < 32)
548 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
549 else if (Index < 0 && Index > -32)
550 Opcodes[1] = RISCV::VSLIDEUP_VI;
551 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
552 }
553 case TTI::SK_Reverse: {
554 // TODO: Cases to improve here:
555 // * Illegal vector types
556 // * i64 on RV32
557 // * i1 vector
558 // At low LMUL, most of the cost is producing the vrgather index register.
559 // At high LMUL, the cost of the vrgather itself will dominate.
560 // Example sequence:
561 // csrr a0, vlenb
562 // srli a0, a0, 3
563 // addi a0, a0, -1
564 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
565 // vid.v v9
566 // vrsub.vx v10, v9, a0
567 // vrgather.vv v9, v8, v10
568 InstructionCost LenCost = 3;
569 if (LT.second.isFixedLengthVector())
570 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
571 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
572 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
573 if (LT.second.isFixedLengthVector() &&
574 isInt<5>(LT.second.getVectorNumElements() - 1))
575 Opcodes[1] = RISCV::VRSUB_VI;
576 InstructionCost GatherCost =
577 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
578 // Mask (i1) vectors additionally require an extend and a truncate
579 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
580 return LT.first * (LenCost + GatherCost + ExtendCost);
581 }
582 }
583 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
584}
585
587RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
588 unsigned AddressSpace,
590 if (!isLegalMaskedLoadStore(Src, Alignment) ||
592 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
593 CostKind);
594
595 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
596}
597
599 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
600 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
601 bool UseMaskForCond, bool UseMaskForGaps) {
602 if (isa<ScalableVectorType>(VecTy))
604 auto *FVTy = cast<FixedVectorType>(VecTy);
605 InstructionCost MemCost =
606 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
607 unsigned VF = FVTy->getNumElements() / Factor;
608
609 // The interleaved memory access pass will lower interleaved memory ops (i.e.
610 // a load or store combined with a specific shuffle) to vlseg/vsseg
611 // intrinsics. In those cases we can treat it as if it's just one (legal)
612 // memory op.
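 // For example (illustrative), a Factor=2 load of <8 x i32> feeding two
 // deinterleaving shuffles becomes a single vlseg2e32.v and is costed as one
 // legal memory operation.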
613 if (!UseMaskForCond && !UseMaskForGaps &&
614 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
615 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
616 // Need to make sure the type hasn't been scalarized
617 if (LT.second.isFixedLengthVector()) {
618 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
619 LT.second.getVectorNumElements());
620 // FIXME: We use the memory op cost of the *legalized* type here, because
621 // its getMemoryOpCost returns a really expensive cost for types like
622 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
623 // Should the memory op cost of these be cheaper?
624 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
625 AddressSpace, DL)) {
626 InstructionCost LegalMemCost = getMemoryOpCost(
627 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
628 return LT.first + LegalMemCost;
629 }
630 }
631 }
632
633 // An interleaved load will look like this for Factor=3:
634 // %wide.vec = load <12 x i32>, ptr %3, align 4
635 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
636 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
637 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
638 if (Opcode == Instruction::Load) {
639 InstructionCost Cost = MemCost;
640 for (unsigned Index : Indices) {
641 FixedVectorType *SubVecTy =
642 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
643 auto Mask = createStrideMask(Index, Factor, VF);
644 InstructionCost ShuffleCost =
646 CostKind, 0, nullptr, {});
647 Cost += ShuffleCost;
648 }
649 return Cost;
650 }
651
652 // TODO: Model for NF > 2
653 // We'll need to enhance getShuffleCost to model shuffles that are just
654 // inserts and extracts into subvectors, since they won't have the full cost
655 // of a vrgather.
656 // An interleaved store for 3 vectors of 4 lanes will look like
657 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
658 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
659 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
660 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
661 // store <12 x i32> %interleaved.vec, ptr %10, align 4
662 if (Factor != 2)
663 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
664 Alignment, AddressSpace, CostKind,
665 UseMaskForCond, UseMaskForGaps);
666
667 assert(Opcode == Instruction::Store && "Opcode must be a store");
668 // For an interleaving store of 2 vectors, we perform one large interleaving
669 // shuffle that goes into the wide store
670 auto Mask = createInterleaveMask(VF, Factor);
671 InstructionCost ShuffleCost =
673 CostKind, 0, nullptr, {});
674 return MemCost + ShuffleCost;
675}
676
678 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
679 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
681 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
682 Alignment, CostKind, I);
683
684 if ((Opcode == Instruction::Load &&
685 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
686 (Opcode == Instruction::Store &&
687 !isLegalMaskedScatter(DataTy, Align(Alignment))))
688 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
689 Alignment, CostKind, I);
690
691 // Cost is proportional to the number of memory operations implied. For
692 // scalable vectors, we use an estimate on that number since we don't
693 // know exactly what VL will be.
694 auto &VTy = *cast<VectorType>(DataTy);
695 InstructionCost MemOpCost =
696 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
697 {TTI::OK_AnyValue, TTI::OP_None}, I);
698 unsigned NumLoads = getEstimatedVLFor(&VTy);
699 return NumLoads * MemOpCost;
700}
701
703 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
704 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
705 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
706 !isLegalStridedLoadStore(DataTy, Alignment)) ||
707 (Opcode != Instruction::Load && Opcode != Instruction::Store))
708 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
709 Alignment, CostKind, I);
710
712 return TTI::TCC_Basic;
713
714 // Cost is proportional to the number of memory operations implied. For
715 // scalable vectors, we use an estimate on that number since we don't
716 // know exactly what VL will be.
717 auto &VTy = *cast<VectorType>(DataTy);
718 InstructionCost MemOpCost =
719 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
720 {TTI::OK_AnyValue, TTI::OP_None}, I);
721 unsigned NumLoads = getEstimatedVLFor(&VTy);
722 return NumLoads * MemOpCost;
723}
724
725// Currently, these represent both throughput and codesize costs
726// for the respective intrinsics. The costs in this table are simply
727// instruction counts with the following adjustments made:
728// * One vsetvli is considered free.
730 {Intrinsic::floor, MVT::f32, 9},
731 {Intrinsic::floor, MVT::f64, 9},
732 {Intrinsic::ceil, MVT::f32, 9},
733 {Intrinsic::ceil, MVT::f64, 9},
734 {Intrinsic::trunc, MVT::f32, 7},
735 {Intrinsic::trunc, MVT::f64, 7},
736 {Intrinsic::round, MVT::f32, 9},
737 {Intrinsic::round, MVT::f64, 9},
738 {Intrinsic::roundeven, MVT::f32, 9},
739 {Intrinsic::roundeven, MVT::f64, 9},
740 {Intrinsic::rint, MVT::f32, 7},
741 {Intrinsic::rint, MVT::f64, 7},
742 {Intrinsic::lrint, MVT::i32, 1},
743 {Intrinsic::lrint, MVT::i64, 1},
744 {Intrinsic::llrint, MVT::i64, 1},
745 {Intrinsic::nearbyint, MVT::f32, 9},
746 {Intrinsic::nearbyint, MVT::f64, 9},
747 {Intrinsic::bswap, MVT::i16, 3},
748 {Intrinsic::bswap, MVT::i32, 12},
749 {Intrinsic::bswap, MVT::i64, 31},
750 {Intrinsic::vp_bswap, MVT::i16, 3},
751 {Intrinsic::vp_bswap, MVT::i32, 12},
752 {Intrinsic::vp_bswap, MVT::i64, 31},
753 {Intrinsic::vp_fshl, MVT::i8, 7},
754 {Intrinsic::vp_fshl, MVT::i16, 7},
755 {Intrinsic::vp_fshl, MVT::i32, 7},
756 {Intrinsic::vp_fshl, MVT::i64, 7},
757 {Intrinsic::vp_fshr, MVT::i8, 7},
758 {Intrinsic::vp_fshr, MVT::i16, 7},
759 {Intrinsic::vp_fshr, MVT::i32, 7},
760 {Intrinsic::vp_fshr, MVT::i64, 7},
761 {Intrinsic::bitreverse, MVT::i8, 17},
762 {Intrinsic::bitreverse, MVT::i16, 24},
763 {Intrinsic::bitreverse, MVT::i32, 33},
764 {Intrinsic::bitreverse, MVT::i64, 52},
765 {Intrinsic::vp_bitreverse, MVT::i8, 17},
766 {Intrinsic::vp_bitreverse, MVT::i16, 24},
767 {Intrinsic::vp_bitreverse, MVT::i32, 33},
768 {Intrinsic::vp_bitreverse, MVT::i64, 52},
769 {Intrinsic::ctpop, MVT::i8, 12},
770 {Intrinsic::ctpop, MVT::i16, 19},
771 {Intrinsic::ctpop, MVT::i32, 20},
772 {Intrinsic::ctpop, MVT::i64, 21},
773 {Intrinsic::vp_ctpop, MVT::i8, 12},
774 {Intrinsic::vp_ctpop, MVT::i16, 19},
775 {Intrinsic::vp_ctpop, MVT::i32, 20},
776 {Intrinsic::vp_ctpop, MVT::i64, 21},
777 {Intrinsic::vp_ctlz, MVT::i8, 19},
778 {Intrinsic::vp_ctlz, MVT::i16, 28},
779 {Intrinsic::vp_ctlz, MVT::i32, 31},
780 {Intrinsic::vp_ctlz, MVT::i64, 35},
781 {Intrinsic::vp_cttz, MVT::i8, 16},
782 {Intrinsic::vp_cttz, MVT::i16, 23},
783 {Intrinsic::vp_cttz, MVT::i32, 24},
784 {Intrinsic::vp_cttz, MVT::i64, 25},
785};
786
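// Map a VP intrinsic ID to its corresponding ISD::VP_* opcode using the table
// in llvm/IR/VPIntrinsics.def; returns ISD::DELETED_NODE when no mapping
// exists.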
788 switch (ID) {
789#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
790 case Intrinsic::VPID: \
791 return ISD::VPSD;
792#include "llvm/IR/VPIntrinsics.def"
793#undef HELPER_MAP_VPID_TO_VPSD
794 }
795 return ISD::DELETED_NODE;
796}
797
801 auto *RetTy = ICA.getReturnType();
802 switch (ICA.getID()) {
803 case Intrinsic::ceil:
804 case Intrinsic::floor:
805 case Intrinsic::trunc:
806 case Intrinsic::rint:
807 case Intrinsic::lrint:
808 case Intrinsic::llrint:
809 case Intrinsic::round:
810 case Intrinsic::roundeven: {
811 // These all use the same code.
813 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
814 return LT.first * 8;
815 break;
816 }
817 case Intrinsic::umin:
818 case Intrinsic::umax:
819 case Intrinsic::smin:
820 case Intrinsic::smax: {
822 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
823 return LT.first;
824
825 if (ST->hasVInstructions() && LT.second.isVector()) {
826 unsigned Op;
827 switch (ICA.getID()) {
828 case Intrinsic::umin:
829 Op = RISCV::VMINU_VV;
830 break;
831 case Intrinsic::umax:
832 Op = RISCV::VMAXU_VV;
833 break;
834 case Intrinsic::smin:
835 Op = RISCV::VMIN_VV;
836 break;
837 case Intrinsic::smax:
838 Op = RISCV::VMAX_VV;
839 break;
840 }
841 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
842 }
843 break;
844 }
845 case Intrinsic::sadd_sat:
846 case Intrinsic::ssub_sat:
847 case Intrinsic::uadd_sat:
848 case Intrinsic::usub_sat:
849 case Intrinsic::fabs:
850 case Intrinsic::sqrt: {
852 if (ST->hasVInstructions() && LT.second.isVector())
853 return LT.first;
854 break;
855 }
856 case Intrinsic::ctpop: {
858 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
859 return LT.first;
860 break;
861 }
862 case Intrinsic::abs: {
864 if (ST->hasVInstructions() && LT.second.isVector()) {
865 // vrsub.vi v10, v8, 0
866 // vmax.vv v8, v8, v10
867 return LT.first * 2;
868 }
869 break;
870 }
871 case Intrinsic::get_active_lane_mask: {
872 if (ST->hasVInstructions()) {
873 Type *ExpRetTy = VectorType::get(
874 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
875 auto LT = getTypeLegalizationCost(ExpRetTy);
876
877 // vid.v v8 // considered hoisted
878 // vsaddu.vx v8, v8, a0
879 // vmsltu.vx v0, v8, a1
880 return LT.first *
881 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
882 LT.second, CostKind);
883 }
884 break;
885 }
886 // TODO: add more intrinsics
887 case Intrinsic::experimental_stepvector: {
889 // Legalisation of illegal types involves an `index' instruction plus
890 // (LT.first - 1) vector adds.
891 if (ST->hasVInstructions())
892 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
893 (LT.first - 1) *
894 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
895 return 1 + (LT.first - 1);
896 }
897 case Intrinsic::vp_rint: {
898 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
899 unsigned Cost = 5;
901 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
902 return Cost * LT.first;
903 break;
904 }
905 case Intrinsic::vp_nearbyint: {
906 // One more read and one write of fflags than vp_rint.
907 unsigned Cost = 7;
909 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
910 return Cost * LT.first;
911 break;
912 }
913 case Intrinsic::vp_ceil:
914 case Intrinsic::vp_floor:
915 case Intrinsic::vp_round:
916 case Intrinsic::vp_roundeven:
917 case Intrinsic::vp_roundtozero: {
918 // Rounding with static rounding mode needs two more instructions to
919 // swap/write FRM than vp_rint.
920 unsigned Cost = 7;
922 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
923 if (TLI->isOperationCustom(VPISD, LT.second))
924 return Cost * LT.first;
925 break;
926 }
927 }
928
929 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
930 if (auto LT = getTypeLegalizationCost(RetTy);
931 LT.second.isVector()) {
932 MVT EltTy = LT.second.getVectorElementType();
933 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
934 ICA.getID(), EltTy))
935 return LT.first * Entry->Cost;
936 }
937 }
938
940}
941
943 Type *Src,
946 const Instruction *I) {
947 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
948 if (!IsVectorType)
949 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
950
951 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
952 (Src->getScalarSizeInBits() <= ST->getELen()) &&
953 (Dst->getScalarSizeInBits() <= ST->getELen());
954
955 // FIXME: Need to compute legalizing cost for illegal types.
956 if (!IsTypeLegal)
957 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
958
959 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
960 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
961
962 int ISD = TLI->InstructionOpcodeToISD(Opcode);
963 assert(ISD && "Invalid opcode");
964
965 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
966 (int)Log2_32(Src->getScalarSizeInBits());
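 // PowDiff is the log2 ratio of destination to source element width, e.g.
 // i8 -> i32 gives PowDiff == 2 and selects vsext.vf4/vzext.vf4 below.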
967 switch (ISD) {
968 case ISD::SIGN_EXTEND:
969 case ISD::ZERO_EXTEND: {
970 const unsigned SrcEltSize = Src->getScalarSizeInBits();
971 if (SrcEltSize == 1) {
972 // We do not use vsext/vzext to extend from a mask vector.
973 // Instead we use the following instructions to extend from a mask vector:
974 // vmv.v.i v8, 0
975 // vmerge.vim v8, v8, -1, v0
976 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
977 DstLT.second, CostKind);
978 }
979 if ((PowDiff < 1) || (PowDiff > 3))
980 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
981 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
982 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
983 unsigned Op =
984 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
985 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
986 }
987 case ISD::TRUNCATE:
988 if (Dst->getScalarSizeInBits() == 1) {
989 // We do not use a sequence of vncvt instructions to truncate to a mask
990 // vector, so we cannot use PowDiff to calculate the cost.
991 // Instead we use the following instructions to truncate to a mask vector:
992 // vand.vi v8, v8, 1
993 // vmsne.vi v0, v8, 0
994 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
995 SrcLT.second, CostKind);
996 }
997 [[fallthrough]];
998 case ISD::FP_EXTEND:
999 case ISD::FP_ROUND: {
1000 // Counts of narrow/widen instructions.
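 // For example (illustrative), truncating i64 elements to i8 takes three
 // narrowing steps (i64 -> i32 -> i16 -> i8), so three vnsrl.wi instructions
 // are costed.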
1001 unsigned SrcEltSize = Src->getScalarSizeInBits();
1002 unsigned DstEltSize = Dst->getScalarSizeInBits();
1003
1004 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1005 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1006 : RISCV::VFNCVT_F_F_W;
1008 for (; SrcEltSize != DstEltSize;) {
1009 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1010 ? MVT::getIntegerVT(DstEltSize)
1011 : MVT::getFloatingPointVT(DstEltSize);
1012 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1013 DstEltSize =
1014 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1015 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1016 }
1017 return Cost;
1018 }
1019 case ISD::FP_TO_SINT:
1020 case ISD::FP_TO_UINT:
1021 case ISD::SINT_TO_FP:
1022 case ISD::UINT_TO_FP:
1023 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1024 // The cost of converting from or to a mask vector is different from the
1025 // other cases, so we cannot use PowDiff to calculate it.
1026 // For mask vector to fp, we should use the following instructions:
1027 // vmv.v.i v8, 0
1028 // vmerge.vim v8, v8, -1, v0
1029 // vfcvt.f.x.v v8, v8
1030
1031 // And for fp vector to mask, we use:
1032 // vfncvt.rtz.x.f.w v9, v8
1033 // vand.vi v8, v9, 1
1034 // vmsne.vi v0, v8, 0
1035 return 3;
1036 }
1037 if (std::abs(PowDiff) <= 1)
1038 return 1;
1039 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1040 // so it only needs two conversions.
1041 if (Src->isIntOrIntVectorTy())
1042 return 2;
1043 // Counts of narrow/widen instructions.
1044 return std::abs(PowDiff);
1045 }
1046 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1047}
1048
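// Return the estimated number of lanes processed for a vector type: the exact
// element count for fixed vectors, or the known-minimum element count scaled
// by the vscale used for tuning for scalable vectors (e.g., illustratively,
// <vscale x 4 x i32> with a tuning vscale of 2 is estimated as VL = 8).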
1049unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1050 if (isa<ScalableVectorType>(Ty)) {
1051 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1052 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1053 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1054 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1055 }
1056 return cast<FixedVectorType>(Ty)->getNumElements();
1057}
1058
1061 FastMathFlags FMF,
1063 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1064 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1065
1066 // Skip if scalar size of Ty is bigger than ELEN.
1067 if (Ty->getScalarSizeInBits() > ST->getELen())
1068 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1069
1070 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1071 if (Ty->getElementType()->isIntegerTy(1)) {
1072 // SelectionDAGBuilder does following transforms:
1073 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1074 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1075 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1076 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1077 else
1078 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1079 }
1080
1081 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1083 InstructionCost ExtraCost = 0;
1084 switch (IID) {
1085 case Intrinsic::maximum:
1086 if (FMF.noNaNs()) {
1087 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1088 } else {
1089 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1090 RISCV::VFMV_F_S};
1091 // Cost of the canonical NaN + branch
1092 // lui a0, 523264
1093 // fmv.w.x fa0, a0
1094 Type *DstTy = Ty->getScalarType();
1095 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1096 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1097 ExtraCost = 1 +
1098 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1100 getCFInstrCost(Instruction::Br, CostKind);
1101 }
1102 break;
1103
1104 case Intrinsic::minimum:
1105 if (FMF.noNaNs()) {
1106 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1107 } else {
1108 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1109 RISCV::VFMV_F_S};
1110 // Cost of the canonical NaN + branch
1111 // lui a0, 523264
1112 // fmv.w.x fa0, a0
1113 Type *DstTy = Ty->getScalarType();
1114 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1115 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1116 ExtraCost = 1 +
1117 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1119 getCFInstrCost(Instruction::Br, CostKind);
1120 }
1121 break;
1122 }
1123 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1124 }
1125
1126 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1127 unsigned SplitOp;
1129 switch (IID) {
1130 default:
1131 llvm_unreachable("Unsupported intrinsic");
1132 case Intrinsic::smax:
1133 SplitOp = RISCV::VMAX_VV;
1134 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1135 break;
1136 case Intrinsic::smin:
1137 SplitOp = RISCV::VMIN_VV;
1138 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1139 break;
1140 case Intrinsic::umax:
1141 SplitOp = RISCV::VMAXU_VV;
1142 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1143 break;
1144 case Intrinsic::umin:
1145 SplitOp = RISCV::VMINU_VV;
1146 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1147 break;
1148 case Intrinsic::maxnum:
1149 SplitOp = RISCV::VFMAX_VV;
1150 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1151 break;
1152 case Intrinsic::minnum:
1153 SplitOp = RISCV::VFMIN_VV;
1154 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1155 break;
1156 }
1157 // Add a cost for data larger than LMUL8
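 // e.g. (illustrative) a type that legalizes into two parts (LT.first == 2)
 // pays for one extra SplitOp before the final reduction sequence.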
1158 InstructionCost SplitCost =
1159 (LT.first > 1) ? (LT.first - 1) *
1160 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1161 : 0;
1162 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1163}
1164
1167 std::optional<FastMathFlags> FMF,
1169 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1170 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1171
1172 // Skip if scalar size of Ty is bigger than ELEN.
1173 if (Ty->getScalarSizeInBits() > ST->getELen())
1174 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1175
1176 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1177 assert(ISD && "Invalid opcode");
1178
1179 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1180 ISD != ISD::FADD)
1181 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1182
1183 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1185 Type *ElementTy = Ty->getElementType();
1186 if (ElementTy->isIntegerTy(1)) {
1187 if (ISD == ISD::AND) {
1188 // Example sequences:
1189 // vsetvli a0, zero, e8, mf8, ta, ma
1190 // vmnot.m v8, v0
1191 // vcpop.m a0, v8
1192 // seqz a0, a0
1193 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1194 return (LT.first - 1) +
1195 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1196 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1198 } else {
1199 // Example sequences:
1200 // vsetvli a0, zero, e8, mf8, ta, ma
1201 // vcpop.m a0, v0
1202 // snez a0, a0
1203 Opcodes = {RISCV::VCPOP_M};
1204 return (LT.first - 1) +
1205 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1206 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1208 }
1209 }
1210
1211 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1213 Opcodes.push_back(RISCV::VFMV_S_F);
1214 for (unsigned i = 0; i < LT.first.getValue(); i++)
1215 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1216 Opcodes.push_back(RISCV::VFMV_F_S);
1217 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1218 }
1219 unsigned SplitOp;
1220 switch (ISD) {
1221 case ISD::ADD:
1222 SplitOp = RISCV::VADD_VV;
1223 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1224 break;
1225 case ISD::OR:
1226 SplitOp = RISCV::VOR_VV;
1227 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1228 break;
1229 case ISD::XOR:
1230 SplitOp = RISCV::VXOR_VV;
1231 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1232 break;
1233 case ISD::AND:
1234 SplitOp = RISCV::VAND_VV;
1235 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1236 break;
1237 case ISD::FADD:
1238 SplitOp = RISCV::VFADD_VV;
1239 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1240 break;
1241 }
1242 // Add a cost for data larger than LMUL8
1243 InstructionCost SplitCost =
1244 (LT.first > 1) ? (LT.first - 1) *
1245 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1246 : 0;
1247 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1248}
1249
1251 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1253 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1254 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1255 FMF, CostKind);
1256
1257 // Skip if scalar size of ResTy is bigger than ELEN.
1258 if (ResTy->getScalarSizeInBits() > ST->getELen())
1259 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1260 FMF, CostKind);
1261
1262 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1263 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1264 FMF, CostKind);
1265
1266 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1267
1268 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1269 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1270 FMF, CostKind);
1271
1272 return (LT.first - 1) +
1273 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1274}
1275
1277 TTI::OperandValueInfo OpInfo,
1279 assert(OpInfo.isConstant() && "non constant operand?");
1280 if (!isa<VectorType>(Ty))
1281 // FIXME: We need to account for immediate materialization here, but doing
1282 // a decent job requires more knowledge about the immediate than we
1283 // currently have here.
1284 return 0;
1285
1286 if (OpInfo.isUniform())
1287 // vmv.v.i, vmv.v.x, or vfmv.v.f
1288 // We ignore the cost of the scalar constant materialization to be consistent
1289 // with how we treat scalar constants themselves just above.
1290 return 1;
1291
1292 return getConstantPoolLoadCost(Ty, CostKind);
1293}
1294
1295
1297 MaybeAlign Alignment,
1298 unsigned AddressSpace,
1300 TTI::OperandValueInfo OpInfo,
1301 const Instruction *I) {
1302 EVT VT = TLI->getValueType(DL, Src, true);
1303 // Type legalization can't handle structs
1304 if (VT == MVT::Other)
1305 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1306 CostKind, OpInfo, I);
1307
1309 if (Opcode == Instruction::Store && OpInfo.isConstant())
1310 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1311 InstructionCost BaseCost =
1312 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1313 CostKind, OpInfo, I);
1314 // Assume memory op costs scale with the number of vector registers
1315 // possibly accessed by the instruction. Note that BasicTTI already
1316 // handles the LT.first term for us.
1317 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1318 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1319 BaseCost *= TLI->getLMULCost(LT.second);
1320 return Cost + BaseCost;
1321
1322}
1323
1325 Type *CondTy,
1326 CmpInst::Predicate VecPred,
1328 const Instruction *I) {
1330 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1331 I);
1332
1333 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1334 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1335 I);
1336
1337 // Skip if scalar size of ValTy is bigger than ELEN.
1338 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1339 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1340 I);
1341
1342 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1343 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1344 if (CondTy->isVectorTy()) {
1345 if (ValTy->getScalarSizeInBits() == 1) {
1346 // vmandn.mm v8, v8, v9
1347 // vmand.mm v9, v0, v9
1348 // vmor.mm v0, v9, v8
1349 return LT.first *
1350 getRISCVInstructionCost(
1351 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1352 LT.second, CostKind);
1353 }
1354 // vselect and max/min are supported natively.
1355 return LT.first *
1356 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1357 }
1358
1359 if (ValTy->getScalarSizeInBits() == 1) {
1360 // vmv.v.x v9, a0
1361 // vmsne.vi v9, v9, 0
1362 // vmandn.mm v8, v8, v9
1363 // vmand.mm v9, v0, v9
1364 // vmor.mm v0, v9, v8
1365 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1366 return LT.first *
1367 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1368 InterimVT, CostKind) +
1369 LT.first * getRISCVInstructionCost(
1370 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1371 LT.second, CostKind);
1372 }
1373
1374 // vmv.v.x v10, a0
1375 // vmsne.vi v0, v10, 0
1376 // vmerge.vvm v8, v9, v8, v0
1377 return LT.first * getRISCVInstructionCost(
1378 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1379 LT.second, CostKind);
1380 }
1381
1382 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1383 CmpInst::isIntPredicate(VecPred)) {
1384 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1385 // provided they incur the same cost across all implementations
1386 return LT.first *
1387 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1388 }
1389
1390 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1391 CmpInst::isFPPredicate(VecPred)) {
1392
1393 // Use VMXOR_MM and VMXNOR_MM to generate an all-true/all-false mask
1394 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1395 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1396
1397 // If we do not support the input floating point vector type, use the base
1398 // one which will calculate as:
1399 // ScalarizeCost + Num * Cost for fixed vector,
1400 // InvalidCost for scalable vector.
1401 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1402 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1403 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1404 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1405 I);
1406
1407 // Assuming vector fp compare and mask instructions are all the same cost
1408 // until a need arises to differentiate them.
1409 switch (VecPred) {
1410 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1411 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1412 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1413 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1414 return LT.first * getRISCVInstructionCost(
1415 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1416 LT.second, CostKind);
1417
1418 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1419 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1420 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1421 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1422 return LT.first *
1423 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1424 LT.second, CostKind);
1425
1426 case CmpInst::FCMP_OEQ: // vmfeq.vv
1427 case CmpInst::FCMP_OGT: // vmflt.vv
1428 case CmpInst::FCMP_OGE: // vmfle.vv
1429 case CmpInst::FCMP_OLT: // vmflt.vv
1430 case CmpInst::FCMP_OLE: // vmfle.vv
1431 case CmpInst::FCMP_UNE: // vmfne.vv
1432 return LT.first *
1433 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1434 default:
1435 break;
1436 }
1437 }
1438
1439 // TODO: Add cost for scalar type.
1440
1441 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1442}
1443
1446 const Instruction *I) {
1448 return Opcode == Instruction::PHI ? 0 : 1;
1449 // Branches are assumed to be predicted.
1450 return 0;
1451}
1452
1455 unsigned Index, Value *Op0,
1456 Value *Op1) {
1457 assert(Val->isVectorTy() && "This must be a vector type");
1458
1459 if (Opcode != Instruction::ExtractElement &&
1460 Opcode != Instruction::InsertElement)
1461 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1462
1463 // Legalize the type.
1464 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1465
1466 // This type is legalized to a scalar type.
1467 if (!LT.second.isVector()) {
1468 auto *FixedVecTy = cast<FixedVectorType>(Val);
1469 // If Index is a known constant, cost is zero.
1470 if (Index != -1U)
1471 return 0;
1472 // Extract/InsertElement with non-constant index is very costly when
1473 // scalarized; estimate cost of loads/stores sequence via the stack:
1474 // ExtractElement cost: store vector to stack, load scalar;
1475 // InsertElement cost: store vector to stack, store scalar, load vector.
1476 Type *ElemTy = FixedVecTy->getElementType();
1477 auto NumElems = FixedVecTy->getNumElements();
1478 auto Align = DL.getPrefTypeAlign(ElemTy);
1479 InstructionCost LoadCost =
1480 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1481 InstructionCost StoreCost =
1482 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1483 return Opcode == Instruction::ExtractElement
1484 ? StoreCost * NumElems + LoadCost
1485 : (StoreCost + LoadCost) * NumElems + StoreCost;
1486 }
1487
1488 // For unsupported scalable vector.
1489 if (LT.second.isScalableVector() && !LT.first.isValid())
1490 return LT.first;
1491
1492 if (!isTypeLegal(Val))
1493 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1494
1495 // Mask vector extract/insert is expanded via e8.
1496 if (Val->getScalarSizeInBits() == 1) {
1497 VectorType *WideTy =
1499 cast<VectorType>(Val)->getElementCount());
1500 if (Opcode == Instruction::ExtractElement) {
1501 InstructionCost ExtendCost
1502 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1504 InstructionCost ExtractCost
1505 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1506 return ExtendCost + ExtractCost;
1507 }
1508 InstructionCost ExtendCost
1509 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1511 InstructionCost InsertCost
1512 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1513 InstructionCost TruncCost
1514 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1516 return ExtendCost + InsertCost + TruncCost;
1517 }
1518
1519
1520 // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
1521 // vector and vslideup + vmv.s.x to insert an element into a vector.
1522 unsigned BaseCost = 1;
1523 // For insertelement we also need to add 1 to the index to form the input of vslideup.
1524 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1525
1526 if (Index != -1U) {
1527 // The type may be split. For fixed-width vectors we can normalize the
1528 // index to the new type.
1529 if (LT.second.isFixedLengthVector()) {
1530 unsigned Width = LT.second.getVectorNumElements();
1531 Index = Index % Width;
1532 }
1533
1534 // We could extract/insert the first element without vslidedown/vslideup.
1535 if (Index == 0)
1536 SlideCost = 0;
1537 else if (Opcode == Instruction::InsertElement)
1538 SlideCost = 1; // With a constant index, we do not need to use addi.
1539 }
1540
1541 // Extracting an i64 on a target with XLEN=32 needs more instructions.
1542 if (Val->getScalarType()->isIntegerTy() &&
1543 ST->getXLen() < Val->getScalarSizeInBits()) {
1544 // For extractelement, we need the following instructions:
1545 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1546 // vslidedown.vx v8, v8, a0
1547 // vmv.x.s a0, v8
1548 // li a1, 32
1549 // vsrl.vx v8, v8, a1
1550 // vmv.x.s a1, v8
1551
1552 // For insertelement, we need the following instructions:
1553 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1554 // vmv.v.i v12, 0
1555 // vslide1up.vx v16, v12, a1
1556 // vslide1up.vx v12, v16, a0
1557 // addi a0, a2, 1
1558 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1559 // vslideup.vx v8, v12, a2
1560
1561 // TODO: should we count these special vsetvlis?
1562 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1563 }
1564 return BaseCost + SlideCost;
1565}
1566
1568 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1570 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1571
1572 // TODO: Handle more cost kinds.
1574 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1575 Args, CxtI);
1576
1577 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1578 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1579 Args, CxtI);
1580
1581 // Skip if scalar size of Ty is bigger than ELEN.
1582 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1583 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1584 Args, CxtI);
1585
1586 // Legalize the type.
1587 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1588
1589 // TODO: Handle scalar type.
1590 if (!LT.second.isVector())
1591 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1592 Args, CxtI);
1593
1594
1595 auto getConstantMatCost =
1596 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1597 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1598 // Two sub-cases:
1599 // * Has a 5 bit immediate operand which can be splatted.
1600 // * Has a larger immediate which must be materialized in a scalar register.
1601 // We return 0 for both as we currently ignore the cost of materializing
1602 // scalar constants in GPRs.
1603 return 0;
1604
1605 return getConstantPoolLoadCost(Ty, CostKind);
1606 };
1607
1608 // Add the cost of materializing any constant vectors required.
1609 InstructionCost ConstantMatCost = 0;
1610 if (Op1Info.isConstant())
1611 ConstantMatCost += getConstantMatCost(0, Op1Info);
1612 if (Op2Info.isConstant())
1613 ConstantMatCost += getConstantMatCost(1, Op2Info);
1614
1615 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1616 case ISD::ADD:
1617 case ISD::SUB:
1618 case ISD::AND:
1619 case ISD::OR:
1620 case ISD::XOR:
1621 case ISD::SHL:
1622 case ISD::SRL:
1623 case ISD::SRA:
1624 case ISD::MUL:
1625 case ISD::MULHS:
1626 case ISD::MULHU:
1627 case ISD::FADD:
1628 case ISD::FSUB:
1629 case ISD::FMUL:
1630 case ISD::FNEG: {
1631 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1632 }
1633 default:
1634 return ConstantMatCost +
1635 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1636 Args, CxtI);
1637 }
1638}
1639
1640// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1642 ArrayRef<const Value *> Ptrs, const Value *Base,
1643 const TTI::PointersChainInfo &Info, Type *AccessTy,
1646 // In the basic model we take into account GEP instructions only
1647 // (although here can come alloca instruction, a value, constants and/or
1648 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
1649 // pointer). Typically, if Base is not a GEP-instruction and all the
1650 // pointers are relative to the same base address, all the rest are
1651 // either GEP instructions, PHIs, bitcasts or constants. When we have same
1652 // base, we just calculate the cost of each non-Base GEP as an ADD operation
1653 // if any of their indices is non-constant.
1654 // If there are no known dependencies between the pointers, the cost is
1655 // calculated as a sum of the costs of the GEP instructions.
1656 for (auto [I, V] : enumerate(Ptrs)) {
1657 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1658 if (!GEP)
1659 continue;
1660 if (Info.isSameBase() && V != Base) {
1661 if (GEP->hasAllConstantIndices())
1662 continue;
1663 // If the chain is unit-stride and BaseReg + stride*i is a legal
1664 // addressing mode, then presume the base GEP is sitting around in a
1665 // register somewhere and check if we can fold the offset relative to
1666 // it.
1667 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1668 if (Info.isUnitStride() &&
1669 isLegalAddressingMode(AccessTy,
1670 /* BaseGV */ nullptr,
1671 /* BaseOffset */ Stride * I,
1672 /* HasBaseReg */ true,
1673 /* Scale */ 0,
1674 GEP->getType()->getPointerAddressSpace()))
1675 continue;
1676 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1677 {TTI::OK_AnyValue, TTI::OP_None},
1678 {TTI::OK_AnyValue, TTI::OP_None},
1679 std::nullopt);
1680 } else {
1681 SmallVector<const Value *> Indices(GEP->indices());
1682 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1683 Indices, AccessTy, CostKind);
1684 }
1685 }
1686 return Cost;
1687}
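To make the unit-stride fold above concrete, here is a standalone illustration (the helper name and the explicit bounds are assumptions, not code from this file): RV32I/RV64I loads and stores carry a 12-bit signed immediate, so a constant offset of Stride * I can fold into the base register while it stays inside that range; beyond it, the GEP is charged as the ADD above.

#include <cstdint>

// Illustration only: does a constant byte offset fit the simm12 field of a
// RISC-V load/store, so that no separate address computation is needed?
bool offsetLikelyFolds(uint64_t Stride, uint64_t Index) {
  int64_t Offset = static_cast<int64_t>(Stride * Index);
  return Offset >= -2048 && Offset <= 2047;
}
// For an i64 chain (Stride = 8): indices 0..255 fold (offsets up to 2040),
// index 256 gives offset 2048 and falls back to an explicit add.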
1688
1689void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1690 TTI::UnrollingPreferences &UP,
1691 OptimizationRemarkEmitter *ORE) {
1692 // TODO: More tuning on benchmarks and metrics with changes as needed
1693 // should be applied to all of the settings below to improve performance.
1694
1695
1696 if (ST->enableDefaultUnroll())
1697 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1698
1699 // Enable upper-bound unrolling universally, not dependent on the conditions
1700 // below.
1701 UP.UpperBound = true;
1702
1703 // Disable loop unrolling for Oz and Os.
1704 UP.OptSizeThreshold = 0;
1705 UP.PartialOptSizeThreshold = 0;
1706 if (L->getHeader()->getParent()->hasOptSize())
1707 return;
1708
1709 SmallVector<BasicBlock *, 4> ExitingBlocks;
1710 L->getExitingBlocks(ExitingBlocks);
1711 LLVM_DEBUG(dbgs() << "Loop has:\n"
1712 << "Blocks: " << L->getNumBlocks() << "\n"
1713 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1714
1715 // Allow at most one exit other than the latch. This acts as an early
1716 // exit, mirroring the profitability calculation of the runtime unroller.
1717 if (ExitingBlocks.size() > 2)
1718 return;
1719
1720 // Limit the CFG of the loop body for targets with a branch predictor.
1721 // Allowing 4 blocks permits if-then-else diamonds in the body.
1722 if (L->getNumBlocks() > 4)
1723 return;
1724
1725 // Don't unroll vectorized loops, including the remainder loop
1726 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1727 return;
1728
1729 // Scan the loop: don't unroll loops with calls as this could prevent
1730 // inlining.
1731 InstructionCost Cost = 0;
1732 for (auto *BB : L->getBlocks()) {
1733 for (auto &I : *BB) {
1734 // Initial setting - Don't unroll loops containing vectorized
1735 // instructions.
1736 if (I.getType()->isVectorTy())
1737 return;
1738
1739 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1740 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1741 if (!isLoweredToCall(F))
1742 continue;
1743 }
1744 return;
1745 }
1746
1747 SmallVector<const Value *> Operands(I.operand_values());
1748 Cost += getInstructionCost(&I, Operands,
1749 TargetTransformInfo::TCK_SizeAndLatency);
1750 }
1751 }
1752
1753 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1754
1755 UP.Partial = true;
1756 UP.Runtime = true;
1757 UP.UnrollRemainder = true;
1758 UP.UnrollAndJam = true;
1759 UP.UnrollAndJamInnerLoopThreshold = 60;
1760
1761 // Forcing unrolling of small loops can be very useful because of the
1762 // branch-taken cost of the backedge.
1763 if (Cost < 12)
1764 UP.Force = true;
1765}
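For illustration only, the kind of loop these preferences are aimed at looks like the made-up function below: a single small block with no calls and no vector instructions, so its TCK_SizeAndLatency cost stays under the force threshold of 12 and runtime unrolling mostly removes taken backedge branches.

// Hypothetical example, not from any benchmark: short call-free scalar body
// with a runtime trip count.
void scale_accumulate(float *X, const float *Y, float A, int N) {
  for (int I = 0; I < N; ++I)
    X[I] += A * Y[I];
}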
1766
1767void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1768 TTI::PeelingPreferences &PP) {
1769 BaseT::getPeelingPreferences(L, SE, PP);
1770}
1771
1772unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1773 TypeSize Size = DL.getTypeSizeInBits(Ty);
1774 if (Ty->isVectorTy()) {
1775 if (Size.isScalable() && ST->hasVInstructions())
1776 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1777
1778 if (ST->useRVVForFixedLengthVectors())
1779 return divideCeil(Size, ST->getRealMinVLen());
1780 }
1781
1782 return BaseT::getRegUsageForType(Ty);
1783}
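A worked instance of the scalable branch above, assuming RISCV::RVVBitsPerBlock is 64: <vscale x 4 x i64> has a known minimum size of 4 * 64 = 256 bits, so the estimate is ceil(256 / 64) = 4 registers, i.e. an LMUL=4 register group. The helper below merely restates that arithmetic and is not part of the file.

#include <cstdint>

// ceil(KnownMinBits / RVVBitsPerBlock), mirroring the divideCeil call above.
unsigned rvvRegEstimate(uint64_t KnownMinBits) {
  const uint64_t RVVBitsPerBlock = 64; // assumed value of RISCV::RVVBitsPerBlock
  return static_cast<unsigned>((KnownMinBits + RVVBitsPerBlock - 1) /
                               RVVBitsPerBlock);
}
// rvvRegEstimate(256) == 4 for <vscale x 4 x i64>.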
1784
1785unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1786 if (SLPMaxVF.getNumOccurrences())
1787 return SLPMaxVF;
1788
1789 // Return how many elements can fit in getRegisterBitWidth. This is the
1790 // same routine as used in the LoopVectorizer. We should probably be
1791 // accounting for whether we actually have instructions with the right
1792 // lane type, but we don't have enough information to do that without
1793 // some additional plumbing which hasn't been justified yet.
1794 TypeSize RegWidth =
1795 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1796 // If no vector registers, or absurd element widths, disable
1797 // vectorization by returning 1.
1798 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1799}
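As a hedged numeric example (all figures are assumptions): with riscv-v-register-bit-width-lmul left at its default of 2 and a minimum VLEN of 128, the fixed-width register query yields 256 bits, so 32-bit elements give an SLP maximum VF of max(1, 256 / 32) = 8, while an element wider than the register clamps to 1.

#include <algorithm>

// Restates the clamp above: report VF = 1 (no SLP vectorization) instead of 0
// when the element does not fit.
unsigned slpMaxVF(unsigned RegWidthBits, unsigned ElemWidthBits) {
  return std::max(1u, RegWidthBits / ElemWidthBits);
}
// slpMaxVF(256, 32) == 8;  slpMaxVF(256, 512) == 1.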
1800
1801bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1802 const TargetTransformInfo::LSRCost &C2) {
1803 // The RISC-V-specific part here is that the instruction count gets first priority.
1804 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1805 C1.NumIVMuls, C1.NumBaseAdds,
1806 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1807 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1808 C2.NumIVMuls, C2.NumBaseAdds,
1809 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1810}
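The comparison above is lexicographic, so a formula that needs fewer instructions wins before any other field is consulted. A reduced sketch of the same std::tie pattern, with made-up numbers:

#include <tuple>

// The first differing field decides the whole comparison.
bool fewerInsnsWins(unsigned Insns1, unsigned Regs1,
                    unsigned Insns2, unsigned Regs2) {
  return std::tie(Insns1, Regs1) < std::tie(Insns2, Regs2);
}
// fewerInsnsWins(3, 9, 4, 2) == true: 3 < 4 settles it even though 9 > 2.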
1811
1812bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1813 auto *VTy = dyn_cast<VectorType>(DataTy);
1814 if (!VTy || VTy->isScalableTy())
1815 return false;
1816
1817 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1818 return false;
1819 return true;
1820}
1821
1822bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1823 const Function *Callee) const {
1824 const TargetMachine &TM = getTLI()->getTargetMachine();
1825
1826 const FeatureBitset &CallerBits =
1827 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1828 const FeatureBitset &CalleeBits =
1829 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1830
1831 // Inline a callee if its target-features are a subset of the caller's
1832 // target-features.
1833 return (CallerBits & CalleeBits) == CalleeBits;
1834}
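A small illustration of the subset rule above, using std::bitset in place of LLVM's FeatureBitset and made-up feature positions:

#include <bitset>

// The callee may be inlined only if every feature it was compiled with is
// also enabled in the caller.
bool calleeIsSubsetOfCaller(const std::bitset<64> &CallerBits,
                            const std::bitset<64> &CalleeBits) {
  return (CallerBits & CalleeBits) == CalleeBits;
}
// Caller {M, A, V}, callee {M, A}    -> true, safe to inline.
// Caller {M, A},    callee {M, A, V} -> false, the callee requires V.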
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
Hexagon Common GEP
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
if(VerifyEach)
const char LLVMTargetMachineRef TM
static cl::opt< unsigned > SLPMaxVF("riscv-v-slp-max-vf", cl::desc("Overrides result used for getMaximumVF query which is used " "exclusively by SLP vectorizer."), cl::Hidden)
static cl::opt< unsigned > RVVRegisterWidthLMUL("riscv-v-register-bit-width-lmul", cl::desc("The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), cl::init(2), cl::Hidden)
static VectorType * getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, LLVMContext &C)
static const CostTblEntry VectorIntrinsicCostTable[]
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm)
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID)
This file defines a TargetTransformInfo::Concept conforming object specific to the RISC-V target mach...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:76
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:582
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:756
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:755
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:969
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:438
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:654
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:891
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:855
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:339
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:996
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition: InstrTypes.h:1010
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:1008
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:998
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:1007
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:1001
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:1004
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:1005
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:1000
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:1002
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:1009
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:1006
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition: InstrTypes.h:995
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:1003
bool isFPPredicate() const
Definition: InstrTypes.h:1122
bool isIntPredicate() const
Definition: InstrTypes.h:1123
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool noNaNs() const
Definition: FMF.h:66
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
static InstructionCost getInvalid(CostType Val=0)
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment)
bool isLegalStridedLoadStore(Type *DataType, Align Alignment)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating an interleaved load/store intrinsic for this type will be legal.
The main scalar evolution driver.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const DataLayout & getDataLayout() const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:333
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:337
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
AddressSpace
Definition: NVPTXBaseInfo.h:21
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr int PoisonMaskElem
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
DWARFExpression::Operation Op
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2025
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).