RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
36
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
40 size_t NumInstr = OpCodes.size();
42 return NumInstr;
43 InstructionCost LMULCost = TLI->getLMULCost(VT);
45 return LMULCost * NumInstr;
47 for (auto Op : OpCodes) {
48 switch (Op) {
49 case RISCV::VRGATHER_VI:
50 Cost += TLI->getVRGatherVICost(VT);
51 break;
52 case RISCV::VRGATHER_VV:
53 Cost += TLI->getVRGatherVVCost(VT);
54 break;
55 case RISCV::VSLIDEUP_VI:
56 case RISCV::VSLIDEDOWN_VI:
57 Cost += TLI->getVSlideVICost(VT);
58 break;
59 case RISCV::VSLIDEUP_VX:
60 case RISCV::VSLIDEDOWN_VX:
61 Cost += TLI->getVSlideVXCost(VT);
62 break;
63 case RISCV::VREDMAX_VS:
64 case RISCV::VREDMIN_VS:
65 case RISCV::VREDMAXU_VS:
66 case RISCV::VREDMINU_VS:
67 case RISCV::VREDSUM_VS:
68 case RISCV::VREDAND_VS:
69 case RISCV::VREDOR_VS:
70 case RISCV::VREDXOR_VS:
71 case RISCV::VFREDMAX_VS:
72 case RISCV::VFREDMIN_VS:
73 case RISCV::VFREDUSUM_VS: {
74 unsigned VL = VT.getVectorMinNumElements();
75 if (!VT.isFixedLengthVector())
76 VL *= *getVScaleForTuning();
77 Cost += Log2_32_Ceil(VL);
78 break;
79 }
80 case RISCV::VFREDOSUM_VS: {
81 unsigned VL = VT.getVectorMinNumElements();
82 if (!VT.isFixedLengthVector())
83 VL *= *getVScaleForTuning();
84 Cost += VL;
85 break;
86 }
87 case RISCV::VMV_X_S:
88 case RISCV::VMV_S_X:
89 case RISCV::VFMV_F_S:
90 case RISCV::VFMV_S_F:
91 case RISCV::VMNAND_MM:
92 case RISCV::VCPOP_M:
93 Cost += 1;
94 break;
95 default:
96 Cost += LMULCost;
97 }
98 }
99 return Cost;
100}
101
104 assert(Ty->isIntegerTy() &&
105 "getIntImmCost can only estimate cost of materialising integers");
106
107 // We have a Zero register, so 0 is always free.
108 if (Imm == 0)
109 return TTI::TCC_Free;
110
111 // Otherwise, we check how many instructions it will take to materialise.
112 const DataLayout &DL = getDataLayout();
113 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
114}
115
116// Look for patterns of shift followed by AND that can be turned into a pair of
117// shifts. We won't need to materialize an immediate for the AND so these can
118// be considered free.
119static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
120 uint64_t Mask = Imm.getZExtValue();
121 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
122 if (!BO || !BO->hasOneUse())
123 return false;
124
125 if (BO->getOpcode() != Instruction::Shl)
126 return false;
127
128 if (!isa<ConstantInt>(BO->getOperand(1)))
129 return false;
130
131 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
132 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
133 // is a mask shifted by c2 bits with c3 leading zeros.
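  // Illustrative example (not taken from the source): with ShAmt == 11 and
  // Mask == 0xFFFF800 (a 17-bit mask shifted left by 11, so 36 leading zeros),
  // countr_zero(Mask) == 11 == ShAmt, and (and (shl x, 11), 0xFFFF800) can
  // lower to (srli (slli x, 47), 36) with no materialized AND immediate.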
134 if (isShiftedMask_64(Mask)) {
135 unsigned Trailing = llvm::countr_zero(Mask);
136 if (ShAmt == Trailing)
137 return true;
138 }
139
140 return false;
141}
142
144 const APInt &Imm, Type *Ty,
146 Instruction *Inst) {
147 assert(Ty->isIntegerTy() &&
148 "getIntImmCost can only estimate cost of materialising integers");
149
150 // We have a Zero register, so 0 is always free.
151 if (Imm == 0)
152 return TTI::TCC_Free;
153
154 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
155 // commutative, in others the immediate comes from a specific argument index.
156 bool Takes12BitImm = false;
157 unsigned ImmArgIdx = ~0U;
158
159 switch (Opcode) {
160 case Instruction::GetElementPtr:
161 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
162 // split up large offsets in GEP into better parts than ConstantHoisting
163 // can.
164 return TTI::TCC_Free;
165 case Instruction::And:
166 // zext.h
167 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
168 return TTI::TCC_Free;
169 // zext.w
170 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
171 return TTI::TCC_Free;
172 // bclri
173 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
174 return TTI::TCC_Free;
175 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
176 canUseShiftPair(Inst, Imm))
177 return TTI::TCC_Free;
178 Takes12BitImm = true;
179 break;
180 case Instruction::Add:
181 Takes12BitImm = true;
182 break;
183 case Instruction::Or:
184 case Instruction::Xor:
185 // bseti/binvi
186 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
187 return TTI::TCC_Free;
188 Takes12BitImm = true;
189 break;
190 case Instruction::Mul:
191 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
192 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
193 return TTI::TCC_Free;
194 // One more or less than a power of 2 can use SLLI+ADD/SUB.
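  // Illustrative example: a multiply by 9 satisfies (Imm - 1).isPowerOf2()
  // and can be lowered as
  //   slli t0, a0, 3
  //   add  a0, t0, a0
  // so the immediate is treated as free.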
195 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
196 return TTI::TCC_Free;
197 // FIXME: There is no MULI instruction.
198 Takes12BitImm = true;
199 break;
200 case Instruction::Sub:
201 case Instruction::Shl:
202 case Instruction::LShr:
203 case Instruction::AShr:
204 Takes12BitImm = true;
205 ImmArgIdx = 1;
206 break;
207 default:
208 break;
209 }
210
211 if (Takes12BitImm) {
212 // Check immediate is the correct argument...
213 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
214 // ... and fits into the 12-bit immediate.
215 if (Imm.getSignificantBits() <= 64 &&
216 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
217 return TTI::TCC_Free;
218 }
219 }
220
221 // Otherwise, use the full materialisation cost.
222 return getIntImmCost(Imm, Ty, CostKind);
223 }
224
225 // By default, prevent hoisting.
226 return TTI::TCC_Free;
227}
228
231 const APInt &Imm, Type *Ty,
233 // Prevent hoisting in unknown cases.
234 return TTI::TCC_Free;
235}
236
239 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
240 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
243}
244
246 // Currently, the ExpandReductions pass can't expand scalable-vector
247 // reductions, but we still request expansion as RVV doesn't support certain
248 // reductions and the SelectionDAG can't legalize them either.
249 switch (II->getIntrinsicID()) {
250 default:
251 return false;
252 // These reductions have no equivalent in RVV
253 case Intrinsic::vector_reduce_mul:
254 case Intrinsic::vector_reduce_fmul:
255 return true;
256 }
257}
258
259std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
260 if (ST->hasVInstructions())
262 return BaseT::getMaxVScale();
263}
264
265std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
266 if (ST->hasVInstructions())
267 if (unsigned MinVLen = ST->getRealMinVLen();
268 MinVLen >= RISCV::RVVBitsPerBlock)
269 return MinVLen / RISCV::RVVBitsPerBlock;
271}
272
275 unsigned LMUL =
276 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
277 switch (K) {
279 return TypeSize::getFixed(ST->getXLen());
281 return TypeSize::getFixed(
282 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
285 (ST->hasVInstructions() &&
288 : 0);
289 }
290
291 llvm_unreachable("Unsupported register kind");
292}
293
295RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
296 // Add a cost of address generation + the cost of the load. The address
297 // is expected to be a PC relative offset to a constant pool entry
298 // using auipc/addi.
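  // Illustrative sequence (symbol name made up):
  //   auipc   a0, %pcrel_hi(.LCPI0_0)
  //   addi    a0, a0, %pcrel_lo(...)
  //   vle32.v v8, (a0)
  // The constant 2 covers the auipc/addi pair; the rest is the load itself.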
299 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
300 /*AddressSpace=*/0, CostKind);
301}
302
303static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
304 LLVMContext &C) {
305 assert((DataVT.getScalarSizeInBits() != 8 ||
306 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
307 MVT IndexVT = DataVT.changeTypeToInteger();
308 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
309 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
310 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
311}
312
314 VectorType *Tp, ArrayRef<int> Mask,
316 int Index, VectorType *SubTp,
318 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
319
320 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
321
322 // First, handle cases where having a fixed length vector enables us to
323 // give a more accurate cost than falling back to generic scalable codegen.
324 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
325 if (isa<FixedVectorType>(Tp)) {
326 switch (Kind) {
327 default:
328 break;
330 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
331 MVT EltTp = LT.second.getVectorElementType();
332 // If the size of the element is < ELEN then shuffles of interleaves and
333 // deinterleaves of 2 vectors can be lowered into the following
334 // sequences
335 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
336 // Example sequence:
337 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
338 // vwaddu.vv v10, v8, v9
339 // li a0, -1 (ignored)
340 // vwmaccu.vx v10, a0, v9
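  // For reference (illustrative), an interleave mask over two 4-element
  // sources looks like <0, 4, 1, 5, 2, 6, 3, 7>, and the stride-2
  // deinterleave masks handled below look like <0, 2, 4, 6> or <1, 3, 5, 7>.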
341 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
342 return 2 * LT.first * TLI->getLMULCost(LT.second);
343
344 if (Mask[0] == 0 || Mask[0] == 1) {
345 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
346 // Example sequence:
347 // vnsrl.wi v10, v8, 0
348 if (equal(DeinterleaveMask, Mask))
349 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
350 LT.second, CostKind);
351 }
352 }
353 }
354 // vrgather + cost of generating the mask constant.
355 // We model this for an unknown mask with a single vrgather.
356 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
357 (LT.second.getScalarSizeInBits() != 8 ||
358 LT.second.getVectorNumElements() <= 256)) {
359 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
360 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
361 return IndexCost +
362 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
363 }
364 [[fallthrough]];
365 }
368 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
369 // register for the second vrgather. We model this for an unknown
370 // (shuffle) mask.
371 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
372 (LT.second.getScalarSizeInBits() != 8 ||
373 LT.second.getVectorNumElements() <= 256)) {
374 auto &C = Tp->getContext();
375 auto EC = Tp->getElementCount();
376 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
378 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
379 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
380 return 2 * IndexCost +
381 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
382 LT.second, CostKind) +
383 MaskCost;
384 }
385 [[fallthrough]];
386 }
387 case TTI::SK_Select: {
388 // We are going to permute multiple sources and the result will be in
389 // multiple destinations. We provide an accurate cost only for splits where
390 // the element type remains the same.
391 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
392 LT.second.isFixedLengthVector() &&
393 LT.second.getVectorElementType().getSizeInBits() ==
395 LT.second.getVectorNumElements() <
396 cast<FixedVectorType>(Tp)->getNumElements() &&
397 divideCeil(Mask.size(),
398 cast<FixedVectorType>(Tp)->getNumElements()) ==
399 static_cast<unsigned>(*LT.first.getValue())) {
400 unsigned NumRegs = *LT.first.getValue();
401 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
402 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
403 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
404
406 for (unsigned I = 0; I < NumRegs; ++I) {
407 bool IsSingleVector = true;
408 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
409 transform(Mask.slice(I * SubVF,
410 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
411 SubMask.begin(), [&](int I) {
412 bool SingleSubVector = I / VF == 0;
413 IsSingleVector &= SingleSubVector;
414 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
415 });
418 SubVecTy, SubMask, CostKind, 0, nullptr);
419 return Cost;
420 }
421 }
422 break;
423 }
424 }
425 };
426
427 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
428 switch (Kind) {
429 default:
430 // Fallthrough to generic handling.
431 // TODO: Most of these cases will return getInvalid in generic code, and
432 // must be implemented here.
433 break;
435 // Extract at zero is always a subregister extract
436 if (Index == 0)
437 return TTI::TCC_Free;
438
439 // If we're extracting a subvector of at most m1 size at a sub-register
440 // boundary - which unfortunately we need exact vlen to identify - this is
441 // a subregister extract at worst and thus won't require a vslidedown.
442 // TODO: Extend for aligned m2, m4 subvector extracts
443 // TODO: Extend for misaligned (but contained) extracts
444 // TODO: Extend for scalable subvector types
445 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
446 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
447 const unsigned MinVLen = ST->getRealMinVLen();
448 const unsigned MaxVLen = ST->getRealMaxVLen();
449 if (MinVLen == MaxVLen &&
450 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
451 SubLT.second.getSizeInBits() <= MinVLen)
452 return TTI::TCC_Free;
453 }
454
455 // Example sequence:
456 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
457 // vslidedown.vi v8, v9, 2
458 return LT.first *
459 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
461 // Example sequence:
462 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
463 // vslideup.vi v8, v9, 2
464 return LT.first *
465 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
466 case TTI::SK_Select: {
467 // Example sequence:
468 // li a0, 90
469 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
470 // vmv.s.x v0, a0
471 // vmerge.vvm v8, v9, v8, v0
472 // We use 2 for the cost of the mask materialization as this is the true
473 // cost for small masks and most shuffles are small. At worst, this cost
474 // should be a very small constant for the constant pool load. As such,
475 // we may bias towards large selects slightly more than truly warranted.
476 return LT.first *
477 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
478 LT.second, CostKind));
479 }
480 case TTI::SK_Broadcast: {
481 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
482 Instruction::InsertElement);
483 if (LT.second.getScalarSizeInBits() == 1) {
484 if (HasScalar) {
485 // Example sequence:
486 // andi a0, a0, 1
487 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
488 // vmv.v.x v8, a0
489 // vmsne.vi v0, v8, 0
490 return LT.first *
491 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
492 LT.second, CostKind));
493 }
494 // Example sequence:
495 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
496 // vmv.v.i v8, 0
497 // vmerge.vim v8, v8, 1, v0
498 // vmv.x.s a0, v8
499 // andi a0, a0, 1
500 // vmv.v.x v8, a0
501 // vmsne.vi v0, v8, 0
502
503 return LT.first *
504 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
505 RISCV::VMV_X_S, RISCV::VMV_V_X,
506 RISCV::VMSNE_VI},
507 LT.second, CostKind));
508 }
509
510 if (HasScalar) {
511 // Example sequence:
512 // vmv.v.x v8, a0
513 return LT.first *
514 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
515 }
516
517 // Example sequence:
518 // vrgather.vi v9, v8, 0
519 return LT.first *
520 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
521 }
522 case TTI::SK_Splice: {
523 // vslidedown+vslideup.
524 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
525 // of similar code, but I think we expand through memory.
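    // Illustrative sequence for a splice at offset 2 (an assumption, mirroring
    // how llvm.vector.splice is typically lowered):
    //   vslidedown.vi v8, v8, 2
    //   vslideup.vx   v8, v9, a0   ; a0 = VL - 2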
526 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
527 if (Index >= 0 && Index < 32)
528 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
529 else if (Index < 0 && Index > -32)
530 Opcodes[1] = RISCV::VSLIDEUP_VI;
531 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
532 }
533 case TTI::SK_Reverse: {
534 // TODO: Cases to improve here:
535 // * Illegal vector types
536 // * i64 on RV32
537 // * i1 vector
538 // At low LMUL, most of the cost is producing the vrgather index register.
539 // At high LMUL, the cost of the vrgather itself will dominate.
540 // Example sequence:
541 // csrr a0, vlenb
542 // srli a0, a0, 3
543 // addi a0, a0, -1
544 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
545 // vid.v v9
546 // vrsub.vx v10, v9, a0
547 // vrgather.vv v9, v8, v10
548 InstructionCost LenCost = 3;
549 if (LT.second.isFixedLengthVector())
550 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
551 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
552 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
553 if (LT.second.isFixedLengthVector() &&
554 isInt<5>(LT.second.getVectorNumElements() - 1))
555 Opcodes[1] = RISCV::VRSUB_VI;
556 InstructionCost GatherCost =
557 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
558 // An i1 (mask) vector additionally requires an extend and a truncate.
559 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
560 return LT.first * (LenCost + GatherCost + ExtendCost);
561 }
562 }
563 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
564}
565
567RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
568 unsigned AddressSpace,
570 if (!isLegalMaskedLoadStore(Src, Alignment) ||
572 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
573 CostKind);
574
575 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
576}
577
579 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
580 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
581 bool UseMaskForCond, bool UseMaskForGaps) {
582 if (isa<ScalableVectorType>(VecTy))
584 auto *FVTy = cast<FixedVectorType>(VecTy);
585 InstructionCost MemCost =
586 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
587 unsigned VF = FVTy->getNumElements() / Factor;
588
589 // The interleaved memory access pass will lower interleaved memory ops (i.e.
590 // a load or store combined with a specific shuffle) to vlseg/vsseg
591 // intrinsics. In those cases we can treat it as if it were just one (legal)
592 // memory op
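  // Illustrative example: a Factor=2 interleaved load of a wide <8 x i32>
  // becomes a single vlseg2e32.v, so it is costed below as one legal memory
  // operation rather than a load plus shuffles.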
593 if (!UseMaskForCond && !UseMaskForGaps &&
594 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
595 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
596 // Need to make sure the type hasn't been scalarized
597 if (LT.second.isFixedLengthVector()) {
598 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
599 LT.second.getVectorNumElements());
600 // FIXME: We use the memory op cost of the *legalized* type here, because
601 // its getMemoryOpCost returns a really expensive cost for types like
602 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
603 // Should the memory op cost of these be cheaper?
604 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
605 AddressSpace, DL)) {
606 InstructionCost LegalMemCost = getMemoryOpCost(
607 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
608 return LT.first + LegalMemCost;
609 }
610 }
611 }
612
613 // An interleaved load will look like this for Factor=3:
614 // %wide.vec = load <12 x i32>, ptr %3, align 4
615 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
616 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
617 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
618 if (Opcode == Instruction::Load) {
619 InstructionCost Cost = MemCost;
620 for (unsigned Index : Indices) {
621 FixedVectorType *SubVecTy =
622 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
623 auto Mask = createStrideMask(Index, Factor, VF);
624 InstructionCost ShuffleCost =
626 CostKind, 0, nullptr, {});
627 Cost += ShuffleCost;
628 }
629 return Cost;
630 }
631
632 // TODO: Model for NF > 2
633 // We'll need to enhance getShuffleCost to model shuffles that are just
634 // inserts and extracts into subvectors, since they won't have the full cost
635 // of a vrgather.
636 // An interleaved store for 3 vectors of 4 lanes will look like
637 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
638 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
639 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
640 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
641 // store <12 x i32> %interleaved.vec, ptr %10, align 4
642 if (Factor != 2)
643 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
644 Alignment, AddressSpace, CostKind,
645 UseMaskForCond, UseMaskForGaps);
646
647 assert(Opcode == Instruction::Store && "Opcode must be a store");
648 // For an interleaving store of 2 vectors, we perform one large interleaving
649 // shuffle that goes into the wide store
650 auto Mask = createInterleaveMask(VF, Factor);
651 InstructionCost ShuffleCost =
653 CostKind, 0, nullptr, {});
654 return MemCost + ShuffleCost;
655}
656
658 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
659 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
661 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
662 Alignment, CostKind, I);
663
664 if ((Opcode == Instruction::Load &&
665 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
666 (Opcode == Instruction::Store &&
667 !isLegalMaskedScatter(DataTy, Align(Alignment))))
668 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
669 Alignment, CostKind, I);
670
671 // Cost is proportional to the number of memory operations implied. For
672 // scalable vectors, we use an estimate on that number since we don't
673 // know exactly what VL will be.
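  // Worked example (illustrative): a gather from <vscale x 4 x i32> with a
  // tuning vscale of 2 is treated as roughly 8 independent element accesses,
  // so the result is 8 * the scalar memory op cost.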
674 auto &VTy = *cast<VectorType>(DataTy);
675 InstructionCost MemOpCost =
676 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
677 {TTI::OK_AnyValue, TTI::OP_None}, I);
678 unsigned NumLoads = getEstimatedVLFor(&VTy);
679 return NumLoads * MemOpCost;
680}
681
683 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
684 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
685 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
686 !isLegalStridedLoadStore(DataTy, Alignment)) ||
687 (Opcode != Instruction::Load && Opcode != Instruction::Store))
688 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
689 Alignment, CostKind, I);
690
692 return TTI::TCC_Basic;
693
694 // Cost is proportional to the number of memory operations implied. For
695 // scalable vectors, we use an estimate on that number since we don't
696 // know exactly what VL will be.
697 auto &VTy = *cast<VectorType>(DataTy);
698 InstructionCost MemOpCost =
699 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
700 {TTI::OK_AnyValue, TTI::OP_None}, I);
701 unsigned NumLoads = getEstimatedVLFor(&VTy);
702 return NumLoads * MemOpCost;
703}
704
705// Currently, these represent both throughput and codesize costs
706// for the respective intrinsics. The costs in this table are simply
707// instruction counts with the following adjustments made:
708// * One vsetvli is considered free.
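// For example (illustrative): the {Intrinsic::floor, MVT::f32, 9} entry below
// means a vector floor with f32 elements is costed as roughly 9 instructions,
// after treating one vsetvli as free.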
709static const CostTblEntry VectorIntrinsicCostTable[]{
710 {Intrinsic::floor, MVT::f32, 9},
711 {Intrinsic::floor, MVT::f64, 9},
712 {Intrinsic::ceil, MVT::f32, 9},
713 {Intrinsic::ceil, MVT::f64, 9},
714 {Intrinsic::trunc, MVT::f32, 7},
715 {Intrinsic::trunc, MVT::f64, 7},
716 {Intrinsic::round, MVT::f32, 9},
717 {Intrinsic::round, MVT::f64, 9},
718 {Intrinsic::roundeven, MVT::f32, 9},
719 {Intrinsic::roundeven, MVT::f64, 9},
720 {Intrinsic::rint, MVT::f32, 7},
721 {Intrinsic::rint, MVT::f64, 7},
722 {Intrinsic::lrint, MVT::i32, 1},
723 {Intrinsic::lrint, MVT::i64, 1},
724 {Intrinsic::llrint, MVT::i64, 1},
725 {Intrinsic::nearbyint, MVT::f32, 9},
726 {Intrinsic::nearbyint, MVT::f64, 9},
727 {Intrinsic::bswap, MVT::i16, 3},
728 {Intrinsic::bswap, MVT::i32, 12},
729 {Intrinsic::bswap, MVT::i64, 31},
730 {Intrinsic::vp_bswap, MVT::i16, 3},
731 {Intrinsic::vp_bswap, MVT::i32, 12},
732 {Intrinsic::vp_bswap, MVT::i64, 31},
733 {Intrinsic::vp_fshl, MVT::i8, 7},
734 {Intrinsic::vp_fshl, MVT::i16, 7},
735 {Intrinsic::vp_fshl, MVT::i32, 7},
736 {Intrinsic::vp_fshl, MVT::i64, 7},
737 {Intrinsic::vp_fshr, MVT::i8, 7},
738 {Intrinsic::vp_fshr, MVT::i16, 7},
739 {Intrinsic::vp_fshr, MVT::i32, 7},
740 {Intrinsic::vp_fshr, MVT::i64, 7},
741 {Intrinsic::bitreverse, MVT::i8, 17},
742 {Intrinsic::bitreverse, MVT::i16, 24},
743 {Intrinsic::bitreverse, MVT::i32, 33},
744 {Intrinsic::bitreverse, MVT::i64, 52},
745 {Intrinsic::vp_bitreverse, MVT::i8, 17},
746 {Intrinsic::vp_bitreverse, MVT::i16, 24},
747 {Intrinsic::vp_bitreverse, MVT::i32, 33},
748 {Intrinsic::vp_bitreverse, MVT::i64, 52},
749 {Intrinsic::ctpop, MVT::i8, 12},
750 {Intrinsic::ctpop, MVT::i16, 19},
751 {Intrinsic::ctpop, MVT::i32, 20},
752 {Intrinsic::ctpop, MVT::i64, 21},
753 {Intrinsic::vp_ctpop, MVT::i8, 12},
754 {Intrinsic::vp_ctpop, MVT::i16, 19},
755 {Intrinsic::vp_ctpop, MVT::i32, 20},
756 {Intrinsic::vp_ctpop, MVT::i64, 21},
757 {Intrinsic::vp_ctlz, MVT::i8, 19},
758 {Intrinsic::vp_ctlz, MVT::i16, 28},
759 {Intrinsic::vp_ctlz, MVT::i32, 31},
760 {Intrinsic::vp_ctlz, MVT::i64, 35},
761 {Intrinsic::vp_cttz, MVT::i8, 16},
762 {Intrinsic::vp_cttz, MVT::i16, 23},
763 {Intrinsic::vp_cttz, MVT::i32, 24},
764 {Intrinsic::vp_cttz, MVT::i64, 25},
765};
766
767static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
768 switch (ID) {
769#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
770 case Intrinsic::VPID: \
771 return ISD::VPSD;
772#include "llvm/IR/VPIntrinsics.def"
773#undef HELPER_MAP_VPID_TO_VPSD
774 }
775 return ISD::DELETED_NODE;
776}
777
781 auto *RetTy = ICA.getReturnType();
782 switch (ICA.getID()) {
783 case Intrinsic::ceil:
784 case Intrinsic::floor:
785 case Intrinsic::trunc:
786 case Intrinsic::rint:
787 case Intrinsic::lrint:
788 case Intrinsic::llrint:
789 case Intrinsic::round:
790 case Intrinsic::roundeven: {
791 // These all use the same code.
793 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
794 return LT.first * 8;
795 break;
796 }
797 case Intrinsic::umin:
798 case Intrinsic::umax:
799 case Intrinsic::smin:
800 case Intrinsic::smax: {
802 if ((ST->hasVInstructions() && LT.second.isVector()) ||
803 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
804 return LT.first;
805 break;
806 }
807 case Intrinsic::sadd_sat:
808 case Intrinsic::ssub_sat:
809 case Intrinsic::uadd_sat:
810 case Intrinsic::usub_sat:
811 case Intrinsic::fabs:
812 case Intrinsic::sqrt: {
814 if (ST->hasVInstructions() && LT.second.isVector())
815 return LT.first;
816 break;
817 }
818 case Intrinsic::ctpop: {
820 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
821 return LT.first;
822 break;
823 }
824 case Intrinsic::abs: {
826 if (ST->hasVInstructions() && LT.second.isVector()) {
827 // vrsub.vi v10, v8, 0
828 // vmax.vv v8, v8, v10
829 return LT.first * 2;
830 }
831 break;
832 }
833 // TODO: add more intrinsics
834 case Intrinsic::experimental_stepvector: {
835 unsigned Cost = 1; // vid
837 return Cost + (LT.first - 1);
838 }
839 case Intrinsic::vp_rint: {
840 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
841 unsigned Cost = 5;
843 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
844 return Cost * LT.first;
845 break;
846 }
847 case Intrinsic::vp_nearbyint: {
848 // One more read and one write of fflags than vp_rint.
849 unsigned Cost = 7;
851 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
852 return Cost * LT.first;
853 break;
854 }
855 case Intrinsic::vp_ceil:
856 case Intrinsic::vp_floor:
857 case Intrinsic::vp_round:
858 case Intrinsic::vp_roundeven:
859 case Intrinsic::vp_roundtozero: {
860 // Rounding with static rounding mode needs two more instructions to
861 // swap/write FRM than vp_rint.
862 unsigned Cost = 7;
864 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
865 if (TLI->isOperationCustom(VPISD, LT.second))
866 return Cost * LT.first;
867 break;
868 }
869 }
870
871 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
872 if (auto LT = getTypeLegalizationCost(RetTy);
873 LT.second.isVector()) {
874 MVT EltTy = LT.second.getVectorElementType();
875 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
876 ICA.getID(), EltTy))
877 return LT.first * Entry->Cost;
878 }
879 }
880
882}
883
885 Type *Src,
888 const Instruction *I) {
889 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
890 // FIXME: Need to compute legalizing cost for illegal types.
891 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
892 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
893
894 // Skip if element size of Dst or Src is bigger than ELEN.
895 if (Src->getScalarSizeInBits() > ST->getELen() ||
896 Dst->getScalarSizeInBits() > ST->getELen())
897 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
898
899 int ISD = TLI->InstructionOpcodeToISD(Opcode);
900 assert(ISD && "Invalid opcode");
901
902 // FIXME: Need to consider vsetvli and lmul.
903 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
904 (int)Log2_32(Src->getScalarSizeInBits());
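    // Illustrative example: truncating a vector from i64 to i16 elements gives
    // |PowDiff| == 2 and is modeled as two narrowing (vnsrl) steps; likewise
    // an fpext from f16 to f64 elements counts as two widening steps.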
905 switch (ISD) {
906 case ISD::SIGN_EXTEND:
907 case ISD::ZERO_EXTEND:
908 if (Src->getScalarSizeInBits() == 1) {
909 // We do not use vsext/vzext to extend from mask vector.
910 // Instead we use the following instructions to extend from mask vector:
911 // vmv.v.i v8, 0
912 // vmerge.vim v8, v8, -1, v0
913 return 2;
914 }
915 return 1;
916 case ISD::TRUNCATE:
917 if (Dst->getScalarSizeInBits() == 1) {
918 // We do not use several vncvt instructions to truncate to a mask vector,
919 // so we cannot use PowDiff to calculate it.
920 // Instead we use the following instructions to truncate to mask vector:
921 // vand.vi v8, v8, 1
922 // vmsne.vi v0, v8, 0
923 return 2;
924 }
925 [[fallthrough]];
926 case ISD::FP_EXTEND:
927 case ISD::FP_ROUND:
928 // Counts of narrow/widen instructions.
929 return std::abs(PowDiff);
930 case ISD::FP_TO_SINT:
931 case ISD::FP_TO_UINT:
932 case ISD::SINT_TO_FP:
933 case ISD::UINT_TO_FP:
934 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
935 // The cost of converting from or to a mask vector is different from the
936 // other cases, so we cannot use PowDiff to calculate it.
937 // For mask vector to fp, we should use the following instructions:
938 // vmv.v.i v8, 0
939 // vmerge.vim v8, v8, -1, v0
940 // vfcvt.f.x.v v8, v8
941
942 // And for fp vector to mask, we use:
943 // vfncvt.rtz.x.f.w v9, v8
944 // vand.vi v8, v9, 1
945 // vmsne.vi v0, v8, 0
946 return 3;
947 }
948 if (std::abs(PowDiff) <= 1)
949 return 1;
950 // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
951 // so it only needs two conversions.
952 if (Src->isIntOrIntVectorTy())
953 return 2;
954 // Counts of narrow/widen instructions.
955 return std::abs(PowDiff);
956 }
957 }
958 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
959}
960
961unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
962 if (isa<ScalableVectorType>(Ty)) {
963 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
964 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
965 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
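    // Worked example (illustrative): for <vscale x 4 x i32> with a tuning
    // vscale of 2, VectorBits = 128 and MinSize = 128, giving an estimated
    // VLMAX of 8 lanes.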
966 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
967 }
968 return cast<FixedVectorType>(Ty)->getNumElements();
969}
970
973 FastMathFlags FMF,
975 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
976 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
977
978 // Skip if scalar size of Ty is bigger than ELEN.
979 if (Ty->getScalarSizeInBits() > ST->getELen())
980 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
981
982 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
983 if (Ty->getElementType()->isIntegerTy(1)) {
984 // SelectionDAGBuilder does following transforms:
985 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
986 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
987 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
988 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
989 else
990 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
991 }
992
993 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
994 InstructionCost BaseCost = 2;
995
997 return (LT.first - 1) + BaseCost;
998
999 unsigned VL = getEstimatedVLFor(Ty);
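  // Worked example (illustrative): an smax reduction of <8 x i32> is costed
  // as BaseCost (vmv.s.x + vmv.x.s) plus Log2_32_Ceil(8) == 3 for the
  // vredmax.vs reduction tree, i.e. 5 in total for a single legal register.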
1000 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1001}
1002
1005 std::optional<FastMathFlags> FMF,
1007 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1008 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1009
1010 // Skip if scalar size of Ty is bigger than ELEN.
1011 if (Ty->getScalarSizeInBits() > ST->getELen())
1012 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1013
1014 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1015 assert(ISD && "Invalid opcode");
1016
1017 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1018 ISD != ISD::FADD)
1019 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1020
1021 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1023 Type *ElementTy = Ty->getElementType();
1024 if (ElementTy->isIntegerTy(1)) {
1025 if (ISD == ISD::AND) {
1026 // Example sequences:
1027 // vsetvli a0, zero, e8, mf8, ta, ma
1028 // vmnot.m v8, v0
1029 // vcpop.m a0, v8
1030 // seqz a0, a0
1031 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1032 return (LT.first - 1) +
1033 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1034 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1036 } else {
1037 // Example sequences:
1038 // vsetvli a0, zero, e8, mf8, ta, ma
1039 // vcpop.m a0, v0
1040 // snez a0, a0
1041 Opcodes = {RISCV::VCPOP_M};
1042 return (LT.first - 1) +
1043 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1044 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1046 }
1047 }
1048
1049 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1051 Opcodes.push_back(RISCV::VFMV_S_F);
1052 for (unsigned i = 0; i < LT.first.getValue(); i++)
1053 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1054 Opcodes.push_back(RISCV::VFMV_F_S);
1055 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1056 }
1057 unsigned SplitOp;
1058 switch (ISD) {
1059 case ISD::ADD:
1060 SplitOp = RISCV::VADD_VV;
1061 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1062 break;
1063 case ISD::OR:
1064 SplitOp = RISCV::VOR_VV;
1065 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1066 break;
1067 case ISD::XOR:
1068 SplitOp = RISCV::VXOR_VV;
1069 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1070 break;
1071 case ISD::AND:
1072 SplitOp = RISCV::VAND_VV;
1073 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1074 break;
1075 case ISD::FADD:
1076 SplitOp = RISCV::VFADD_VV;
1077 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1078 break;
1079 }
1080 // Add a cost for data larger than LMUL8
1081 InstructionCost SplitCost =
1082 (LT.first > 1) ? (LT.first - 1) *
1083 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1084 : 0;
1085 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1086}
1087
1089 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1091 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1092 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1093 FMF, CostKind);
1094
1095 // Skip if scalar size of ResTy is bigger than ELEN.
1096 if (ResTy->getScalarSizeInBits() > ST->getELen())
1097 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1098 FMF, CostKind);
1099
1100 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1101 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1102 FMF, CostKind);
1103
1104 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1105
1106 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1107 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1108 FMF, CostKind);
1109
1110 return (LT.first - 1) +
1111 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1112}
1113
1115 TTI::OperandValueInfo OpInfo,
1117 assert(OpInfo.isConstant() && "non constant operand?");
1118 if (!isa<VectorType>(Ty))
1119 // FIXME: We need to account for immediate materialization here, but doing
1120 // a decent job requires more knowledge about the immediate than we
1121 // currently have here.
1122 return 0;
1123
1124 if (OpInfo.isUniform())
1125 // vmv.x.i, vmv.v.x, or vfmv.v.f
1126 // We ignore the cost of the scalar constant materialization to be consistent
1127 // with how we treat scalar constants themselves just above.
1128 return 1;
1129
1130 return getConstantPoolLoadCost(Ty, CostKind);
1131}
1132
1133
1135 MaybeAlign Alignment,
1136 unsigned AddressSpace,
1138 TTI::OperandValueInfo OpInfo,
1139 const Instruction *I) {
1140 EVT VT = TLI->getValueType(DL, Src, true);
1141 // Type legalization can't handle structs
1142 if (VT == MVT::Other)
1143 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1144 CostKind, OpInfo, I);
1145
1147 if (Opcode == Instruction::Store && OpInfo.isConstant())
1148 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1149 InstructionCost BaseCost =
1150 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1151 CostKind, OpInfo, I);
1152 // Assume memory op costs scale with the number of vector registers
1153 // possibly accessed by the instruction. Note that BasicTTI already
1154 // handles the LT.first term for us.
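  // Worked example (illustrative): with VLEN=128, a load of <16 x i32>
  // legalizes to an LMUL=4 type, so the base load cost is scaled by roughly 4
  // compared to a single-register (m1) access.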
1155 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1156 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1157 BaseCost *= TLI->getLMULCost(LT.second);
1158 return Cost + BaseCost;
1159
1160}
1161
1163 Type *CondTy,
1164 CmpInst::Predicate VecPred,
1166 const Instruction *I) {
1168 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1169 I);
1170
1171 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1172 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1173 I);
1174
1175 // Skip if scalar size of ValTy is bigger than ELEN.
1176 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1177 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1178 I);
1179
1180 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1181 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1182 if (CondTy->isVectorTy()) {
1183 if (ValTy->getScalarSizeInBits() == 1) {
1184 // vmandn.mm v8, v8, v9
1185 // vmand.mm v9, v0, v9
1186 // vmor.mm v0, v9, v8
1187 return LT.first * 3;
1188 }
1189 // vselect and max/min are supported natively.
1190 return LT.first * 1;
1191 }
1192
1193 if (ValTy->getScalarSizeInBits() == 1) {
1194 // vmv.v.x v9, a0
1195 // vmsne.vi v9, v9, 0
1196 // vmandn.mm v8, v8, v9
1197 // vmand.mm v9, v0, v9
1198 // vmor.mm v0, v9, v8
1199 return LT.first * 5;
1200 }
1201
1202 // vmv.v.x v10, a0
1203 // vmsne.vi v0, v10, 0
1204 // vmerge.vvm v8, v9, v8, v0
1205 return LT.first * 3;
1206 }
1207
1208 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1209 ValTy->isVectorTy()) {
1210 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1211
1212 // Support natively.
1213 if (CmpInst::isIntPredicate(VecPred))
1214 return LT.first * 1;
1215
1216 // If we do not support the input floating point vector type, use the base
1217 // one which will calculate as:
1218 // ScalarizeCost + Num * Cost for fixed vector,
1219 // InvalidCost for scalable vector.
1220 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1221 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1222 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1223 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1224 I);
1225 switch (VecPred) {
1226 // Support natively.
1227 case CmpInst::FCMP_OEQ:
1228 case CmpInst::FCMP_OGT:
1229 case CmpInst::FCMP_OGE:
1230 case CmpInst::FCMP_OLT:
1231 case CmpInst::FCMP_OLE:
1232 case CmpInst::FCMP_UNE:
1233 return LT.first * 1;
1234 // TODO: Other comparisons?
1235 default:
1236 break;
1237 }
1238 }
1239
1240 // TODO: Add cost for scalar type.
1241
1242 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1243}
1244
1247 const Instruction *I) {
1249 return Opcode == Instruction::PHI ? 0 : 1;
1250 // Branches are assumed to be predicted.
1251 return 0;
1252}
1253
1256 unsigned Index, Value *Op0,
1257 Value *Op1) {
1258 assert(Val->isVectorTy() && "This must be a vector type");
1259
1260 if (Opcode != Instruction::ExtractElement &&
1261 Opcode != Instruction::InsertElement)
1262 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1263
1264 // Legalize the type.
1265 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1266
1267 // This type is legalized to a scalar type.
1268 if (!LT.second.isVector()) {
1269 auto *FixedVecTy = cast<FixedVectorType>(Val);
1270 // If Index is a known constant, cost is zero.
1271 if (Index != -1U)
1272 return 0;
1273 // Extract/InsertElement with non-constant index is very costly when
1274 // scalarized; estimate cost of loads/stores sequence via the stack:
1275 // ExtractElement cost: store vector to stack, load scalar;
1276 // InsertElement cost: store vector to stack, store scalar, load vector.
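    // Worked example (illustrative): for a scalarized <4 x i32>, a
    // variable-index extract is costed as 4 scalar stores + 1 scalar load,
    // and a variable-index insert as 5 scalar stores + 4 scalar loads.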
1277 Type *ElemTy = FixedVecTy->getElementType();
1278 auto NumElems = FixedVecTy->getNumElements();
1279 auto Align = DL.getPrefTypeAlign(ElemTy);
1280 InstructionCost LoadCost =
1281 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1282 InstructionCost StoreCost =
1283 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1284 return Opcode == Instruction::ExtractElement
1285 ? StoreCost * NumElems + LoadCost
1286 : (StoreCost + LoadCost) * NumElems + StoreCost;
1287 }
1288
1289 // For unsupported scalable vector.
1290 if (LT.second.isScalableVector() && !LT.first.isValid())
1291 return LT.first;
1292
1293 if (!isTypeLegal(Val))
1294 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1295
1296 // Mask vector extract/insert is expanded via e8.
1297 if (Val->getScalarSizeInBits() == 1) {
1298 VectorType *WideTy =
1300 cast<VectorType>(Val)->getElementCount());
1301 if (Opcode == Instruction::ExtractElement) {
1302 InstructionCost ExtendCost
1303 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1305 InstructionCost ExtractCost
1306 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1307 return ExtendCost + ExtractCost;
1308 }
1309 InstructionCost ExtendCost
1310 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1312 InstructionCost InsertCost
1313 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1314 InstructionCost TruncCost
1315 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1317 return ExtendCost + InsertCost + TruncCost;
1318 }
1319
1320
1321 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
1322 // and vslideup + vmv.s.x to insert element to vector.
1323 unsigned BaseCost = 1;
1324 // For insertelement, we need an extra addi to add 1 to the index used in the vslideup sequence.
1325 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1326
1327 if (Index != -1U) {
1328 // The type may be split. For fixed-width vectors we can normalize the
1329 // index to the new type.
1330 if (LT.second.isFixedLengthVector()) {
1331 unsigned Width = LT.second.getVectorNumElements();
1332 Index = Index % Width;
1333 }
1334
1335 // We could extract/insert the first element without vslidedown/vslideup.
1336 if (Index == 0)
1337 SlideCost = 0;
1338 else if (Opcode == Instruction::InsertElement)
1339 SlideCost = 1; // With a constant index, we do not need to use addi.
1340 }
1341
1342 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1343 if (Val->getScalarType()->isIntegerTy() &&
1344 ST->getXLen() < Val->getScalarSizeInBits()) {
1345 // For extractelement, we need the following instructions:
1346 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1347 // vslidedown.vx v8, v8, a0
1348 // vmv.x.s a0, v8
1349 // li a1, 32
1350 // vsrl.vx v8, v8, a1
1351 // vmv.x.s a1, v8
1352
1353 // For insertelement, we need the following instructions:
1354 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1355 // vmv.v.i v12, 0
1356 // vslide1up.vx v16, v12, a1
1357 // vslide1up.vx v12, v16, a0
1358 // addi a0, a2, 1
1359 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1360 // vslideup.vx v8, v12, a2
1361
1362 // TODO: should we count these special vsetvlis?
1363 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1364 }
1365 return BaseCost + SlideCost;
1366}
1367
1369 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1371 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1372
1373 // TODO: Handle more cost kinds.
1375 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1376 Args, CxtI);
1377
1378 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1379 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1380 Args, CxtI);
1381
1382 // Skip if scalar size of Ty is bigger than ELEN.
1383 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1384 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1385 Args, CxtI);
1386
1387 // Legalize the type.
1388 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1389
1390 // TODO: Handle scalar type.
1391 if (!LT.second.isVector())
1392 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1393 Args, CxtI);
1394
1395
1396 auto getConstantMatCost =
1397 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1398 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1399 // Two sub-cases:
1400 // * Has a 5 bit immediate operand which can be splatted.
1401 // * Has a larger immediate which must be materialized in scalar register
1402 // We return 0 for both as we currently ignore the cost of materializing
1403 // scalar constants in GPRs.
1404 return 0;
1405
1406 return getConstantPoolLoadCost(Ty, CostKind);
1407 };
1408
1409 // Add the cost of materializing any constant vectors required.
1410 InstructionCost ConstantMatCost = 0;
1411 if (Op1Info.isConstant())
1412 ConstantMatCost += getConstantMatCost(0, Op1Info);
1413 if (Op2Info.isConstant())
1414 ConstantMatCost += getConstantMatCost(1, Op2Info);
1415
1416 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1417 case ISD::ADD:
1418 case ISD::SUB:
1419 case ISD::AND:
1420 case ISD::OR:
1421 case ISD::XOR:
1422 case ISD::SHL:
1423 case ISD::SRL:
1424 case ISD::SRA:
1425 case ISD::MUL:
1426 case ISD::MULHS:
1427 case ISD::MULHU:
1428 case ISD::FADD:
1429 case ISD::FSUB:
1430 case ISD::FMUL:
1431 case ISD::FNEG: {
1432 return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
1433 }
1434 default:
1435 return ConstantMatCost +
1436 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1437 Args, CxtI);
1438 }
1439}
1440
1441// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1443 ArrayRef<const Value *> Ptrs, const Value *Base,
1444 const TTI::PointersChainInfo &Info, Type *AccessTy,
1447 // In the basic model we take into account GEP instructions only
1448 // (although an alloca instruction, a plain value, constants and/or
1449 // constant expressions, PHIs, and bitcasts, i.e. whatever is allowed to be
1450 // used as a pointer, can appear here). Typically, if Base is not a GEP
1451 // instruction and all the pointers are relative to the same base address,
1452 // all the rest are either GEP instructions, PHIs, bitcasts or constants.
1453 // When they share the same base, we calculate the cost of each non-Base
1454 // GEP as an ADD operation if any of its indices is non-constant.
1455 // If there are no known dependencies between the pointers, the cost is
1456 // calculated as a sum of the costs of the GEP instructions.
1457 for (auto [I, V] : enumerate(Ptrs)) {
1458 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1459 if (!GEP)
1460 continue;
1461 if (Info.isSameBase() && V != Base) {
1462 if (GEP->hasAllConstantIndices())
1463 continue;
1464 // If the chain is unit-stride and BaseReg + stride*i is a legal
1465 // addressing mode, then presume the base GEP is sitting around in a
1466 // register somewhere and check if we can fold the offset relative to
1467 // it.
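      // Illustrative example: for i32 accesses in a unit-stride chain, the
      // I-th pointer is BaseReg + 4*I, which fits the reg+imm12 addressing
      // mode for small I, so no extra ADD is charged for it.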
1468 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1469 if (Info.isUnitStride() &&
1470 isLegalAddressingMode(AccessTy,
1471 /* BaseGV */ nullptr,
1472 /* BaseOffset */ Stride * I,
1473 /* HasBaseReg */ true,
1474 /* Scale */ 0,
1475 GEP->getType()->getPointerAddressSpace()))
1476 continue;
1477 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1478 {TTI::OK_AnyValue, TTI::OP_None},
1479 {TTI::OK_AnyValue, TTI::OP_None},
1480 std::nullopt);
1481 } else {
1482 SmallVector<const Value *> Indices(GEP->indices());
1483 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1484 Indices, AccessTy, CostKind);
1485 }
1486 }
1487 return Cost;
1488}
1489
1493 // TODO: More tuning on benchmarks and metrics with changes as needed
1494 // would apply to all settings below to enable performance.
1495
1496
1497 if (ST->enableDefaultUnroll())
1498 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1499
1500 // Enable upper-bound unrolling universally, not dependent upon the conditions
1501 // below.
1502 UP.UpperBound = true;
1503
1504 // Disable loop unrolling for Oz and Os.
1505 UP.OptSizeThreshold = 0;
1507 if (L->getHeader()->getParent()->hasOptSize())
1508 return;
1509
1510 SmallVector<BasicBlock *, 4> ExitingBlocks;
1511 L->getExitingBlocks(ExitingBlocks);
1512 LLVM_DEBUG(dbgs() << "Loop has:\n"
1513 << "Blocks: " << L->getNumBlocks() << "\n"
1514 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1515
1516 // Only allow another exit other than the latch. This acts as an early exit
1517 // as it mirrors the profitability calculation of the runtime unroller.
1518 if (ExitingBlocks.size() > 2)
1519 return;
1520
1521 // Limit the CFG of the loop body for targets with a branch predictor.
1522 // Allowing 4 blocks permits if-then-else diamonds in the body.
1523 if (L->getNumBlocks() > 4)
1524 return;
1525
1526 // Don't unroll vectorized loops, including the remainder loop
1527 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1528 return;
1529
1530 // Scan the loop: don't unroll loops with calls as this could prevent
1531 // inlining.
1533 for (auto *BB : L->getBlocks()) {
1534 for (auto &I : *BB) {
1535 // Initial setting - Don't unroll loops containing vectorized
1536 // instructions.
1537 if (I.getType()->isVectorTy())
1538 return;
1539
1540 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1541 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1542 if (!isLoweredToCall(F))
1543 continue;
1544 }
1545 return;
1546 }
1547
1548 SmallVector<const Value *> Operands(I.operand_values());
1551 }
1552 }
1553
1554 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1555
1556 UP.Partial = true;
1557 UP.Runtime = true;
1558 UP.UnrollRemainder = true;
1559 UP.UnrollAndJam = true;
1561
1562 // Forcing unrolling of small loops can be very useful because of the
1563 // branch-taken cost of the backedge.
1564 if (Cost < 12)
1565 UP.Force = true;
1566}
1567
1571}
1572
1575 if (Ty->isVectorTy()) {
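    // Worked example (illustrative): <vscale x 4 x i32> has a known minimum
    // size of 128 bits, so it occupies divideCeil(128, 64) == 2 vector
    // register blocks.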
1576 if (Size.isScalable() && ST->hasVInstructions())
1577 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1578
1580 return divideCeil(Size, ST->getRealMinVLen());
1581 }
1582
1583 return BaseT::getRegUsageForType(Ty);
1584}
1585
1586unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1587 if (SLPMaxVF.getNumOccurrences())
1588 return SLPMaxVF;
1589
1590 // Return how many elements can fit in getRegisterBitWidth. This is the
1591 // same routine as used in LoopVectorizer. We should probably be
1592 // accounting for whether we actually have instructions with the right
1593 // lane type, but we don't have enough information to do that without
1594 // some additional plumbing which hasn't been justified yet.
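  // Worked example (illustrative, assuming fixed-length RVV codegen is
  // enabled): with VLEN=128 and the default riscv-v-register-bit-width-lmul
  // of 2, the fixed-width register width is 256 bits, so ElemWidth=32 yields
  // a maximum SLP VF of 8.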
1595 TypeSize RegWidth =
1597 // If no vector registers, or absurd element widths, disable
1598 // vectorization by returning 1.
1599 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1600}
1601
1603 const TargetTransformInfo::LSRCost &C2) {
1604 // The RISC-V-specific choice here is to give instruction count first priority.
1605 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1606 C1.NumIVMuls, C1.NumBaseAdds,
1607 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1608 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1609 C2.NumIVMuls, C2.NumBaseAdds,
1610 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1611}
Definition: InstrTypes.h:970
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:972
@ ICMP_EQ
equal
Definition: InstrTypes.h:986
@ ICMP_NE
not equal
Definition: InstrTypes.h:987
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:981
bool isIntPredicate() const
Definition: InstrTypes.h:1084
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
static InstructionCost getInvalid(CostType Val=0)
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition: Operator.h:41
The optimization diagnostic interface.
bool hasVInstructionsF64() const
unsigned getRealMinVLen() const
bool useRVVForFixedLengthVectors() const
unsigned getXLen() const
bool hasVInstructionsF16() const
bool hasVInstructions() const
unsigned getRealMaxVLen() const
bool hasVInstructionsF32() const
unsigned getELen() const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
std::optional< unsigned > getVScaleForTuning() const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
std::optional< unsigned > getMaxVScale() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind)
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool shouldExpandReduction(const IntrinsicInst *II) const
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind)
Return the cost of materializing an immediate for a value operand of a store instruction.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getRegUsageForType(Type *Ty)
bool isLegalMaskedGather(Type *DataType, Align Alignment)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isLegalMaskedScatter(Type *DataType, Align Alignment)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat opera...
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace, const DataLayout &) const
Returns whether or not generating a interleaved load/store intrinsic for this type will be legal.
The main scalar evolution driver.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
const DataLayout & getDataLayout() const
bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind)
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:333
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:928
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:326
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
Definition: LoopInfo.cpp:1085
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are are tuples (A,...
Definition: STLExtras.h:2386
AddressSpace
Definition: NVPTXBaseInfo.h:21
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:258
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1937
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr int PoisonMaskElem
llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
InstructionCost Cost
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:2005
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).