LLVM 19.0.0git
RISCVTargetTransformInfo.cpp
1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
17#include <cmath>
18#include <optional>
19using namespace llvm;
20
21#define DEBUG_TYPE "riscvtti"
22
24 "riscv-v-register-bit-width-lmul",
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
29
31 "riscv-v-slp-max-vf",
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
36
37InstructionCost
38RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
39                                       TTI::TargetCostKind CostKind) {
40 // Check if the type is valid for all CostKind
41 if (!VT.isVector())
42 return InstructionCost::getInvalid();
43 size_t NumInstr = OpCodes.size();
44 if (CostKind == TTI::TCK_CodeSize)
45 return NumInstr;
46 InstructionCost LMULCost = TLI->getLMULCost(VT);
47 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
48 return LMULCost * NumInstr;
49 InstructionCost Cost = 0;
50 for (auto Op : OpCodes) {
51 switch (Op) {
52 case RISCV::VRGATHER_VI:
53 Cost += TLI->getVRGatherVICost(VT);
54 break;
55 case RISCV::VRGATHER_VV:
56 Cost += TLI->getVRGatherVVCost(VT);
57 break;
58 case RISCV::VSLIDEUP_VI:
59 case RISCV::VSLIDEDOWN_VI:
60 Cost += TLI->getVSlideVICost(VT);
61 break;
62 case RISCV::VSLIDEUP_VX:
63 case RISCV::VSLIDEDOWN_VX:
64 Cost += TLI->getVSlideVXCost(VT);
65 break;
66 case RISCV::VREDMAX_VS:
67 case RISCV::VREDMIN_VS:
68 case RISCV::VREDMAXU_VS:
69 case RISCV::VREDMINU_VS:
70 case RISCV::VREDSUM_VS:
71 case RISCV::VREDAND_VS:
72 case RISCV::VREDOR_VS:
73 case RISCV::VREDXOR_VS:
74 case RISCV::VFREDMAX_VS:
75 case RISCV::VFREDMIN_VS:
76 case RISCV::VFREDUSUM_VS: {
77 unsigned VL = VT.getVectorMinNumElements();
78 if (!VT.isFixedLengthVector())
79 VL *= *getVScaleForTuning();
80 Cost += Log2_32_Ceil(VL);
81 break;
82 }
83 case RISCV::VFREDOSUM_VS: {
84 unsigned VL = VT.getVectorMinNumElements();
85 if (!VT.isFixedLengthVector())
86 VL *= *getVScaleForTuning();
87 Cost += VL;
88 break;
89 }
90 case RISCV::VMV_X_S:
91 case RISCV::VMV_S_X:
92 case RISCV::VFMV_F_S:
93 case RISCV::VFMV_S_F:
94 case RISCV::VMOR_MM:
95 case RISCV::VMXOR_MM:
96 case RISCV::VMAND_MM:
97 case RISCV::VMANDN_MM:
98 case RISCV::VMNAND_MM:
99 case RISCV::VCPOP_M:
100 case RISCV::VFIRST_M:
101 Cost += 1;
102 break;
103 default:
104 Cost += LMULCost;
105 }
106 }
107 return Cost;
108}
109
112 assert(Ty->isIntegerTy() &&
113 "getIntImmCost can only estimate cost of materialising integers");
114
115 // We have a Zero register, so 0 is always free.
116 if (Imm == 0)
117 return TTI::TCC_Free;
118
119 // Otherwise, we check how many instructions it will take to materialise.
120 const DataLayout &DL = getDataLayout();
121 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST());
122}
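// Illustrative sketch of what the materialization-cost query above returns on
// a typical RV64 subtarget (the exact numbers come from
// RISCVMatInt::getIntMatCost and depend on the enabled extensions):
//   Imm = 0       -> TCC_Free (the zero register x0 is used directly)
//   Imm = 2047    -> 1  (a single addi covers any signed 12-bit immediate)
//   Imm = 0x12345 -> 2  (a lui + addi pair: lui a0, 0x12 ; addi a0, a0, 0x345)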
123
124// Look for patterns of shift followed by AND that can be turned into a pair of
125// shifts. We won't need to materialize an immediate for the AND so these can
126// be considered free.
127static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
128 uint64_t Mask = Imm.getZExtValue();
129 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
130 if (!BO || !BO->hasOneUse())
131 return false;
132
133 if (BO->getOpcode() != Instruction::Shl)
134 return false;
135
136 if (!isa<ConstantInt>(BO->getOperand(1)))
137 return false;
138
139 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
140 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
141 // is a mask shifted by c2 bits with c3 leading zeros.
142 if (isShiftedMask_64(Mask)) {
143 unsigned Trailing = llvm::countr_zero(Mask);
144 if (ShAmt == Trailing)
145 return true;
146 }
147
148 return false;
149}
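// Worked example for the pattern above (illustrative, assuming XLEN=64):
// for (and (shl x, 4), 0xFF0) the mask 0xFF0 is a shifted mask whose trailing
// zero count equals the shift amount (4), so the AND constant never needs to
// be materialized:
//   slli t0, x, 56   ; c2 + c3 = 4 + 52 leading zeros of 0xFF0
//   srli t0, t0, 52  ; shift back by c3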
150
152 const APInt &Imm, Type *Ty,
154 Instruction *Inst) {
155 assert(Ty->isIntegerTy() &&
156 "getIntImmCost can only estimate cost of materialising integers");
157
158 // We have a Zero register, so 0 is always free.
159 if (Imm == 0)
160 return TTI::TCC_Free;
161
162 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
163 // commutative; for others the immediate must come from a specific argument index.
164 bool Takes12BitImm = false;
165 unsigned ImmArgIdx = ~0U;
166
167 switch (Opcode) {
168 case Instruction::GetElementPtr:
169 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
170 // split up large offsets in GEP into better parts than ConstantHoisting
171 // can.
172 return TTI::TCC_Free;
173 case Instruction::Store:
174 // If the address is a constant, use the materialization cost.
175 if (Idx == 1)
176 return getIntImmCost(Imm, Ty, CostKind);
177 return TTI::TCC_Free;
178 case Instruction::Load:
179 // If the address is a constant, use the materialization cost.
180 return getIntImmCost(Imm, Ty, CostKind);
181 case Instruction::And:
182 // zext.h
183 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
184 return TTI::TCC_Free;
185 // zext.w
186 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
187 return TTI::TCC_Free;
188 // bclri
189 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
190 return TTI::TCC_Free;
191 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
192 canUseShiftPair(Inst, Imm))
193 return TTI::TCC_Free;
194 Takes12BitImm = true;
195 break;
196 case Instruction::Add:
197 Takes12BitImm = true;
198 break;
199 case Instruction::Or:
200 case Instruction::Xor:
201 // bseti/binvi
202 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
203 return TTI::TCC_Free;
204 Takes12BitImm = true;
205 break;
206 case Instruction::Mul:
207 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
208 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
209 return TTI::TCC_Free;
210 // One more or less than a power of 2 can use SLLI+ADD/SUB.
211 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
212 return TTI::TCC_Free;
213 // FIXME: There is no MULI instruction.
214 Takes12BitImm = true;
215 break;
216 case Instruction::Sub:
217 case Instruction::Shl:
218 case Instruction::LShr:
219 case Instruction::AShr:
220 Takes12BitImm = true;
221 ImmArgIdx = 1;
222 break;
223 default:
224 break;
225 }
226
227 if (Takes12BitImm) {
228 // Check immediate is the correct argument...
229 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
230 // ... and fits into the 12-bit immediate.
231 if (Imm.getSignificantBits() <= 64 &&
232 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
233 return TTI::TCC_Free;
234 }
235 }
236
237 // Otherwise, use the full materialisation cost.
238 return getIntImmCost(Imm, Ty, CostKind);
239 }
240
241 // By default, prevent hoisting.
242 return TTI::TCC_Free;
243}
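// Illustrative consequences of the rules above (assuming RV64 with Zba/Zbs):
//   and x, 0xffffffff -> TCC_Free (zext.w)
//   xor x, 0x800      -> TCC_Free (binvi, power-of-2 immediate)
//   add x, 100        -> TCC_Free (fits the 12-bit add immediate)
//   add x, 5000       -> full materialization cost, so ConstantHoisting may
//                        consider hoisting the constant.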
244
247 const APInt &Imm, Type *Ty,
249 // Prevent hoisting in unknown cases.
250 return TTI::TCC_Free;
251}
252
253bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
254 return ST->hasVInstructions();
255}
256
257TargetTransformInfo::PopcntSupportKind
258RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
259 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
260 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
261 ? TTI::PSK_FastHardware
262 : TTI::PSK_Software;
263}
264
266 // Currently, the ExpandReductions pass can't expand scalable-vector
267 // reductions, but we still request expansion as RVV doesn't support certain
268 // reductions and the SelectionDAG can't legalize them either.
269 switch (II->getIntrinsicID()) {
270 default:
271 return false;
272 // These reductions have no equivalent in RVV
273 case Intrinsic::vector_reduce_mul:
274 case Intrinsic::vector_reduce_fmul:
275 return true;
276 }
277}
278
279std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
280 if (ST->hasVInstructions())
281 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
282 return BaseT::getMaxVScale();
283}
284
285std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
286 if (ST->hasVInstructions())
287 if (unsigned MinVLen = ST->getRealMinVLen();
288 MinVLen >= RISCV::RVVBitsPerBlock)
289 return MinVLen / RISCV::RVVBitsPerBlock;
290 return BaseT::getVScaleForTuning();
291}
292
295 unsigned LMUL =
296 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
297 switch (K) {
299 return TypeSize::getFixed(ST->getXLen());
301 return TypeSize::getFixed(
302 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
305 (ST->hasVInstructions() &&
308 : 0);
309 }
310
311 llvm_unreachable("Unsupported register kind");
312}
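// Illustrative sketch: with -riscv-v-register-bit-width-lmul=2 (the LMUL knob
// above) and a subtarget guaranteeing VLEN >= 128 (e.g. +zvl128b), a
// fixed-length vector register is reported as 2 * 128 = 256 bits, while the
// scalable-vector answer is expressed in vscale units. The exact values
// depend on the subtarget flags.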
313
315RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
316 // Add a cost of address generation + the cost of the load. The address
317 // is expected to be a PC relative offset to a constant pool entry
318 // using auipc/addi.
319 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
320 /*AddressSpace=*/0, CostKind);
321}
322
324 LLVMContext &C) {
325 assert((DataVT.getScalarSizeInBits() != 8 ||
326 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
327 MVT IndexVT = DataVT.changeTypeToInteger();
328 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
329 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
330 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
331}
332
334 VectorType *Tp, ArrayRef<int> Mask,
336 int Index, VectorType *SubTp,
338 const Instruction *CxtI) {
339 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
340
341 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
342
343 // First, handle cases where having a fixed length vector enables us to
344 // give a more accurate cost than falling back to generic scalable codegen.
345 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
346 if (isa<FixedVectorType>(Tp)) {
347 switch (Kind) {
348 default:
349 break;
351 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
352 MVT EltTp = LT.second.getVectorElementType();
353 // If the size of the element is < ELEN then shuffles of interleaves and
354 // deinterleaves of 2 vectors can be lowered into the following
355 // sequences
356 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
357 // Example sequence:
358 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
359 // vwaddu.vv v10, v8, v9
360 // li a0, -1 (ignored)
361 // vwmaccu.vx v10, a0, v9
362 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
363 return 2 * LT.first * TLI->getLMULCost(LT.second);
364
365 if (Mask[0] == 0 || Mask[0] == 1) {
366 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
367 // Example sequence:
368 // vnsrl.wi v10, v8, 0
369 if (equal(DeinterleaveMask, Mask))
370 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
371 LT.second, CostKind);
372 }
373 }
374 }
375 // vrgather + cost of generating the mask constant.
376 // We model this for an unknown mask with a single vrgather.
377 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
378 (LT.second.getScalarSizeInBits() != 8 ||
379 LT.second.getVectorNumElements() <= 256)) {
380 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
381 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
382 return IndexCost +
383 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
384 }
385 [[fallthrough]];
386 }
389 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
390 // register for the second vrgather. We model this for an unknown
391 // (shuffle) mask.
392 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
393 (LT.second.getScalarSizeInBits() != 8 ||
394 LT.second.getVectorNumElements() <= 256)) {
395 auto &C = Tp->getContext();
396 auto EC = Tp->getElementCount();
397 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
399 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
400 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
401 return 2 * IndexCost +
402 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
403 LT.second, CostKind) +
404 MaskCost;
405 }
406 [[fallthrough]];
407 }
408 case TTI::SK_Select: {
409 // We are going to permute multiple sources and the result will be in
410 // multiple destinations. We provide an accurate cost only for splits where
411 // the element type remains the same.
412 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
413 LT.second.isFixedLengthVector() &&
414 LT.second.getVectorElementType().getSizeInBits() ==
416 LT.second.getVectorNumElements() <
417 cast<FixedVectorType>(Tp)->getNumElements() &&
418 divideCeil(Mask.size(),
419 cast<FixedVectorType>(Tp)->getNumElements()) ==
420 static_cast<unsigned>(*LT.first.getValue())) {
421 unsigned NumRegs = *LT.first.getValue();
422 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
423 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
424 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
425
427 for (unsigned I = 0; I < NumRegs; ++I) {
428 bool IsSingleVector = true;
429 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
430 transform(Mask.slice(I * SubVF,
431 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
432 SubMask.begin(), [&](int I) {
433 bool SingleSubVector = I / VF == 0;
434 IsSingleVector &= SingleSubVector;
435 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
436 });
439 SubVecTy, SubMask, CostKind, 0, nullptr);
440 return Cost;
441 }
442 }
443 break;
444 }
445 }
446 };
447
448 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
449 switch (Kind) {
450 default:
451 // Fallthrough to generic handling.
452 // TODO: Most of these cases will return getInvalid in generic code, and
453 // must be implemented here.
454 break;
456 // Extract at zero is always a subregister extract
457 if (Index == 0)
458 return TTI::TCC_Free;
459
460 // If we're extracting a subvector of at most m1 size at a sub-register
461 // boundary - which unfortunately we need exact vlen to identify - this is
462 // a subregister extract at worst and thus won't require a vslidedown.
463 // TODO: Extend for aligned m2, m4 subvector extracts
464 // TODO: Extend for misaligned (but contained) extracts
465 // TODO: Extend for scalable subvector types
466 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
467 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
468 const unsigned MinVLen = ST->getRealMinVLen();
469 const unsigned MaxVLen = ST->getRealMaxVLen();
470 if (MinVLen == MaxVLen &&
471 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
472 SubLT.second.getSizeInBits() <= MinVLen)
473 return TTI::TCC_Free;
474 }
475
476 // Example sequence:
477 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
478 // vslidedown.vi v8, v9, 2
479 return LT.first *
480 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
482 // Example sequence:
483 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
484 // vslideup.vi v8, v9, 2
485 return LT.first *
486 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
487 case TTI::SK_Select: {
488 // Example sequence:
489 // li a0, 90
490 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
491 // vmv.s.x v0, a0
492 // vmerge.vvm v8, v9, v8, v0
493 // We use 2 for the cost of the mask materialization as this is the true
494 // cost for small masks and most shuffles are small. At worst, this cost
495 // should be a very small constant for the constant pool load. As such,
496 // we may bias towards large selects slightly more than truly warranted.
497 return LT.first *
498 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
499 LT.second, CostKind));
500 }
501 case TTI::SK_Broadcast: {
502 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
503 Instruction::InsertElement);
504 if (LT.second.getScalarSizeInBits() == 1) {
505 if (HasScalar) {
506 // Example sequence:
507 // andi a0, a0, 1
508 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
509 // vmv.v.x v8, a0
510 // vmsne.vi v0, v8, 0
511 return LT.first *
512 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
513 LT.second, CostKind));
514 }
515 // Example sequence:
516 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
517 // vmv.v.i v8, 0
518 // vmerge.vim v8, v8, 1, v0
519 // vmv.x.s a0, v8
520 // andi a0, a0, 1
521 // vmv.v.x v8, a0
522 // vmsne.vi v0, v8, 0
523
524 return LT.first *
525 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
526 RISCV::VMV_X_S, RISCV::VMV_V_X,
527 RISCV::VMSNE_VI},
528 LT.second, CostKind));
529 }
530
531 if (HasScalar) {
532 // Example sequence:
533 // vmv.v.x v8, a0
534 return LT.first *
535 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
536 }
537
538 // Example sequence:
539 // vrgather.vi v9, v8, 0
540 return LT.first *
541 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
542 }
543 case TTI::SK_Splice: {
544 // vslidedown+vslideup.
545 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
546 // of similar code, but I think we expand through memory.
547 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
548 if (Index >= 0 && Index < 32)
549 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
550 else if (Index < 0 && Index > -32)
551 Opcodes[1] = RISCV::VSLIDEUP_VI;
552 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
553 }
554 case TTI::SK_Reverse: {
555 // TODO: Cases to improve here:
556 // * Illegal vector types
557 // * i64 on RV32
558 // * i1 vector
559 // At low LMUL, most of the cost is producing the vrgather index register.
560 // At high LMUL, the cost of the vrgather itself will dominate.
561 // Example sequence:
562 // csrr a0, vlenb
563 // srli a0, a0, 3
564 // addi a0, a0, -1
565 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
566 // vid.v v9
567 // vrsub.vx v10, v9, a0
568 // vrgather.vv v9, v8, v10
569 InstructionCost LenCost = 3;
570 if (LT.second.isFixedLengthVector())
571 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
572 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
573 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
574 if (LT.second.isFixedLengthVector() &&
575 isInt<5>(LT.second.getVectorNumElements() - 1))
576 Opcodes[1] = RISCV::VRSUB_VI;
577 InstructionCost GatherCost =
578 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
579 // Mask operations additionally require an extend and a truncate.
580 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
581 return LT.first * (LenCost + GatherCost + ExtendCost);
582 }
583 }
584 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
585}
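// Illustrative sketch of the SK_Reverse modeling above for a fixed <4 x i32>
// vector (the exact numbers depend on getLMULCost for the legalized type):
// the element count minus one (3) fits a 5-bit immediate, so LenCost is 0 and
// the shuffle is costed as vid.v + vrsub.vi + vrgather.vv with no i1
// extend/truncate overhead.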
586
588RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
589 unsigned AddressSpace,
591 if (!isLegalMaskedLoadStore(Src, Alignment) ||
592 CostKind != TTI::TCK_RecipThroughput)
593 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
594 CostKind);
595
596 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
597}
598
600 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
601 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
602 bool UseMaskForCond, bool UseMaskForGaps) {
603 if (isa<ScalableVectorType>(VecTy) && Factor != 2)
605
606 // The interleaved memory access pass will lower interleaved memory ops (i.e.
607 // a load or store combined with a specific shuffle) to vlseg/vsseg
608 // intrinsics. In those cases we can treat the whole thing as a single (legal)
609 // memory op.
610 if (!UseMaskForCond && !UseMaskForGaps &&
611 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
612 auto *VTy = cast<VectorType>(VecTy);
613 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
614 // Need to make sure the type hasn't been scalarized
615 if (LT.second.isVector()) {
616 auto *SubVecTy =
617 VectorType::get(VTy->getElementType(),
618 VTy->getElementCount().divideCoefficientBy(Factor));
619
620 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
621 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
622 AddressSpace, DL)) {
623 // FIXME: We use the memory op cost of the *legalized* type here,
624 // because getMemoryOpCost returns a really expensive cost for
625 // types like <6 x i8>, which show up when doing interleaves of
626 // Factor=3 etc. Should the memory op cost of these be cheaper?
627 auto *LegalVTy = VectorType::get(VTy->getElementType(),
628 LT.second.getVectorElementCount());
629 InstructionCost LegalMemCost = getMemoryOpCost(
630 Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
631 return LT.first + LegalMemCost;
632 }
633 }
634 }
635
636 // TODO: Return the cost of interleaved accesses for scalable vectors when
637 // we are unable to convert them to segment access instructions.
638 if (isa<ScalableVectorType>(VecTy))
640
641 auto *FVTy = cast<FixedVectorType>(VecTy);
642 InstructionCost MemCost =
643 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
644 unsigned VF = FVTy->getNumElements() / Factor;
645
646 // An interleaved load will look like this for Factor=3:
647 // %wide.vec = load <12 x i32>, ptr %3, align 4
648 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
649 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
650 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
651 if (Opcode == Instruction::Load) {
652 InstructionCost Cost = MemCost;
653 for (unsigned Index : Indices) {
654 FixedVectorType *SubVecTy =
655 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
656 auto Mask = createStrideMask(Index, Factor, VF);
657 InstructionCost ShuffleCost =
659 CostKind, 0, nullptr, {});
660 Cost += ShuffleCost;
661 }
662 return Cost;
663 }
664
665 // TODO: Model for NF > 2
666 // We'll need to enhance getShuffleCost to model shuffles that are just
667 // inserts and extracts into subvectors, since they won't have the full cost
668 // of a vrgather.
669 // An interleaved store for 3 vectors of 4 lanes will look like
670 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
671 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
672 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
673 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
674 // store <12 x i32> %interleaved.vec, ptr %10, align 4
675 if (Factor != 2)
676 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
677 Alignment, AddressSpace, CostKind,
678 UseMaskForCond, UseMaskForGaps);
679
680 assert(Opcode == Instruction::Store && "Opcode must be a store");
681 // For an interleaving store of 2 vectors, we perform one large interleaving
682 // shuffle that goes into the wide store
683 auto Mask = createInterleaveMask(VF, Factor);
684 InstructionCost ShuffleCost =
686 CostKind, 0, nullptr, {});
687 return MemCost + ShuffleCost;
688}
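// Illustrative sketch of the Factor=2 store case above: interleaving two
// <4 x i32> values is modeled as one shuffle producing <8 x i32> with mask
// <0,4,1,5,2,6,3,7> (createInterleaveMask(4, 2)) followed by a single wide
// store, so the returned cost is MemCost + ShuffleCost.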
689
691 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
692 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
693 if (CostKind != TTI::TCK_RecipThroughput)
694 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
695 Alignment, CostKind, I);
696
697 if ((Opcode == Instruction::Load &&
698 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
699 (Opcode == Instruction::Store &&
700 !isLegalMaskedScatter(DataTy, Align(Alignment))))
701 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
702 Alignment, CostKind, I);
703
704 // Cost is proportional to the number of memory operations implied. For
705 // scalable vectors, we use an estimate on that number since we don't
706 // know exactly what VL will be.
707 auto &VTy = *cast<VectorType>(DataTy);
708 InstructionCost MemOpCost =
709 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
710 {TTI::OK_AnyValue, TTI::OP_None}, I);
711 unsigned NumLoads = getEstimatedVLFor(&VTy);
712 return NumLoads * MemOpCost;
713}
714
716 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
717 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
718 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
719 !isLegalStridedLoadStore(DataTy, Alignment)) ||
720 (Opcode != Instruction::Load && Opcode != Instruction::Store))
721 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
722 Alignment, CostKind, I);
723
724 if (CostKind == TTI::TCK_CodeSize)
725 return TTI::TCC_Basic;
726
727 // Cost is proportional to the number of memory operations implied. For
728 // scalable vectors, we use an estimate on that number since we don't
729 // know exactly what VL will be.
730 auto &VTy = *cast<VectorType>(DataTy);
731 InstructionCost MemOpCost =
732 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
733 {TTI::OK_AnyValue, TTI::OP_None}, I);
734 unsigned NumLoads = getEstimatedVLFor(&VTy);
735 return NumLoads * MemOpCost;
736}
737
738// Currently, these represent both throughput and codesize costs
739// for the respective intrinsics. The costs in this table are simply
740// instruction counts with the following adjustments made:
741// * One vsetvli is considered free.
743 {Intrinsic::floor, MVT::f32, 9},
744 {Intrinsic::floor, MVT::f64, 9},
745 {Intrinsic::ceil, MVT::f32, 9},
746 {Intrinsic::ceil, MVT::f64, 9},
747 {Intrinsic::trunc, MVT::f32, 7},
748 {Intrinsic::trunc, MVT::f64, 7},
749 {Intrinsic::round, MVT::f32, 9},
750 {Intrinsic::round, MVT::f64, 9},
751 {Intrinsic::roundeven, MVT::f32, 9},
752 {Intrinsic::roundeven, MVT::f64, 9},
753 {Intrinsic::rint, MVT::f32, 7},
754 {Intrinsic::rint, MVT::f64, 7},
755 {Intrinsic::lrint, MVT::i32, 1},
756 {Intrinsic::lrint, MVT::i64, 1},
757 {Intrinsic::llrint, MVT::i64, 1},
758 {Intrinsic::nearbyint, MVT::f32, 9},
759 {Intrinsic::nearbyint, MVT::f64, 9},
760 {Intrinsic::bswap, MVT::i16, 3},
761 {Intrinsic::bswap, MVT::i32, 12},
762 {Intrinsic::bswap, MVT::i64, 31},
763 {Intrinsic::vp_bswap, MVT::i16, 3},
764 {Intrinsic::vp_bswap, MVT::i32, 12},
765 {Intrinsic::vp_bswap, MVT::i64, 31},
766 {Intrinsic::vp_fshl, MVT::i8, 7},
767 {Intrinsic::vp_fshl, MVT::i16, 7},
768 {Intrinsic::vp_fshl, MVT::i32, 7},
769 {Intrinsic::vp_fshl, MVT::i64, 7},
770 {Intrinsic::vp_fshr, MVT::i8, 7},
771 {Intrinsic::vp_fshr, MVT::i16, 7},
772 {Intrinsic::vp_fshr, MVT::i32, 7},
773 {Intrinsic::vp_fshr, MVT::i64, 7},
774 {Intrinsic::bitreverse, MVT::i8, 17},
775 {Intrinsic::bitreverse, MVT::i16, 24},
776 {Intrinsic::bitreverse, MVT::i32, 33},
777 {Intrinsic::bitreverse, MVT::i64, 52},
778 {Intrinsic::vp_bitreverse, MVT::i8, 17},
779 {Intrinsic::vp_bitreverse, MVT::i16, 24},
780 {Intrinsic::vp_bitreverse, MVT::i32, 33},
781 {Intrinsic::vp_bitreverse, MVT::i64, 52},
782 {Intrinsic::ctpop, MVT::i8, 12},
783 {Intrinsic::ctpop, MVT::i16, 19},
784 {Intrinsic::ctpop, MVT::i32, 20},
785 {Intrinsic::ctpop, MVT::i64, 21},
786 {Intrinsic::vp_ctpop, MVT::i8, 12},
787 {Intrinsic::vp_ctpop, MVT::i16, 19},
788 {Intrinsic::vp_ctpop, MVT::i32, 20},
789 {Intrinsic::vp_ctpop, MVT::i64, 21},
790 {Intrinsic::vp_ctlz, MVT::i8, 19},
791 {Intrinsic::vp_ctlz, MVT::i16, 28},
792 {Intrinsic::vp_ctlz, MVT::i32, 31},
793 {Intrinsic::vp_ctlz, MVT::i64, 35},
794 {Intrinsic::vp_cttz, MVT::i8, 16},
795 {Intrinsic::vp_cttz, MVT::i16, 23},
796 {Intrinsic::vp_cttz, MVT::i32, 24},
797 {Intrinsic::vp_cttz, MVT::i64, 25},
798};
799
801 switch (ID) {
802#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
803 case Intrinsic::VPID: \
804 return ISD::VPSD;
805#include "llvm/IR/VPIntrinsics.def"
806#undef HELPER_MAP_VPID_TO_VPSD
807 }
808 return ISD::DELETED_NODE;
809}
810
814 auto *RetTy = ICA.getReturnType();
815 switch (ICA.getID()) {
816 case Intrinsic::ceil:
817 case Intrinsic::floor:
818 case Intrinsic::trunc:
819 case Intrinsic::rint:
820 case Intrinsic::lrint:
821 case Intrinsic::llrint:
822 case Intrinsic::round:
823 case Intrinsic::roundeven: {
824 // These all use the same code.
826 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
827 return LT.first * 8;
828 break;
829 }
830 case Intrinsic::umin:
831 case Intrinsic::umax:
832 case Intrinsic::smin:
833 case Intrinsic::smax: {
835 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
836 return LT.first;
837
838 if (ST->hasVInstructions() && LT.second.isVector()) {
839 unsigned Op;
840 switch (ICA.getID()) {
841 case Intrinsic::umin:
842 Op = RISCV::VMINU_VV;
843 break;
844 case Intrinsic::umax:
845 Op = RISCV::VMAXU_VV;
846 break;
847 case Intrinsic::smin:
848 Op = RISCV::VMIN_VV;
849 break;
850 case Intrinsic::smax:
851 Op = RISCV::VMAX_VV;
852 break;
853 }
854 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
855 }
856 break;
857 }
858 case Intrinsic::sadd_sat:
859 case Intrinsic::ssub_sat:
860 case Intrinsic::uadd_sat:
861 case Intrinsic::usub_sat:
862 case Intrinsic::fabs:
863 case Intrinsic::sqrt: {
865 if (ST->hasVInstructions() && LT.second.isVector())
866 return LT.first;
867 break;
868 }
869 case Intrinsic::ctpop: {
871 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
872 return LT.first;
873 break;
874 }
875 case Intrinsic::abs: {
877 if (ST->hasVInstructions() && LT.second.isVector()) {
878 // vrsub.vi v10, v8, 0
879 // vmax.vv v8, v8, v10
880 return LT.first * 2;
881 }
882 break;
883 }
884 case Intrinsic::get_active_lane_mask: {
885 if (ST->hasVInstructions()) {
886 Type *ExpRetTy = VectorType::get(
887 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
888 auto LT = getTypeLegalizationCost(ExpRetTy);
889
890 // vid.v v8 // considered hoisted
891 // vsaddu.vx v8, v8, a0
892 // vmsltu.vx v0, v8, a1
893 return LT.first *
894 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
895 LT.second, CostKind);
896 }
897 break;
898 }
899 // TODO: add more intrinsics
900 case Intrinsic::experimental_stepvector: {
902 // Legalisation of illegal types involves an `index' instruction plus
903 // (LT.first - 1) vector adds.
904 if (ST->hasVInstructions())
905 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
906 (LT.first - 1) *
907 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
908 return 1 + (LT.first - 1);
909 }
910 case Intrinsic::experimental_cttz_elts: {
911 Type *ArgTy = ICA.getArgTypes()[0];
912 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
913 if (getTLI()->shouldExpandCttzElements(ArgType))
914 break;
915 InstructionCost Cost = getRISCVInstructionCost(
916 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
917
918 // If zero_is_poison is false, then we will generate additional
919 // cmp + select instructions to convert -1 to EVL.
920 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
921 if (ICA.getArgs().size() > 1 &&
922 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
923 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
925 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
927
928 return Cost;
929 }
930 case Intrinsic::vp_rint: {
931 // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
932 unsigned Cost = 5;
934 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
935 return Cost * LT.first;
936 break;
937 }
938 case Intrinsic::vp_nearbyint: {
939 // Needs one more read and one write of fflags than vp_rint.
940 unsigned Cost = 7;
942 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
943 return Cost * LT.first;
944 break;
945 }
946 case Intrinsic::vp_ceil:
947 case Intrinsic::vp_floor:
948 case Intrinsic::vp_round:
949 case Intrinsic::vp_roundeven:
950 case Intrinsic::vp_roundtozero: {
951 // Rounding with a static rounding mode needs two more instructions than
952 // vp_rint to swap/write FRM.
953 unsigned Cost = 7;
955 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
956 if (TLI->isOperationCustom(VPISD, LT.second))
957 return Cost * LT.first;
958 break;
959 }
960 }
961
962 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
963 if (auto LT = getTypeLegalizationCost(RetTy);
964 LT.second.isVector()) {
965 MVT EltTy = LT.second.getVectorElementType();
966 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
967 ICA.getID(), EltTy))
968 return LT.first * Entry->Cost;
969 }
970 }
971
973}
974
976 Type *Src,
979 const Instruction *I) {
980 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
981 if (!IsVectorType)
982 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
983
984 bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
985 (Src->getScalarSizeInBits() <= ST->getELen()) &&
986 (Dst->getScalarSizeInBits() <= ST->getELen());
987
988 // FIXME: Need to compute legalizing cost for illegal types.
989 if (!IsTypeLegal)
990 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
991
992 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
993 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
994
995 int ISD = TLI->InstructionOpcodeToISD(Opcode);
996 assert(ISD && "Invalid opcode");
997
998 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
999 (int)Log2_32(Src->getScalarSizeInBits());
1000 switch (ISD) {
1001 case ISD::SIGN_EXTEND:
1002 case ISD::ZERO_EXTEND: {
1003 const unsigned SrcEltSize = Src->getScalarSizeInBits();
1004 if (SrcEltSize == 1) {
1005 // We do not use vsext/vzext to extend from a mask vector.
1006 // Instead we use the following instructions to extend from a mask vector:
1007 // vmv.v.i v8, 0
1008 // vmerge.vim v8, v8, -1, v0
1009 return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM},
1010 DstLT.second, CostKind);
1011 }
1012 if ((PowDiff < 1) || (PowDiff > 3))
1013 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1014 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1015 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1016 unsigned Op =
1017 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1018 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1019 }
1020 case ISD::TRUNCATE:
1021 if (Dst->getScalarSizeInBits() == 1) {
1022 // We do not use a chain of vncvt instructions to truncate to a mask vector,
1023 // so we cannot use PowDiff to calculate the cost.
1024 // Instead we use the following instructions to truncate to a mask vector:
1025 // vand.vi v8, v8, 1
1026 // vmsne.vi v0, v8, 0
1027 return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1028 SrcLT.second, CostKind);
1029 }
1030 [[fallthrough]];
1031 case ISD::FP_EXTEND:
1032 case ISD::FP_ROUND: {
1033 // Counts of narrow/widen instructions.
1034 unsigned SrcEltSize = Src->getScalarSizeInBits();
1035 unsigned DstEltSize = Dst->getScalarSizeInBits();
1036
1037 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1038 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1039 : RISCV::VFNCVT_F_F_W;
1041 for (; SrcEltSize != DstEltSize;) {
1042 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1043 ? MVT::getIntegerVT(DstEltSize)
1044 : MVT::getFloatingPointVT(DstEltSize);
1045 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1046 DstEltSize =
1047 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1048 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1049 }
1050 return Cost;
1051 }
1052 case ISD::FP_TO_SINT:
1053 case ISD::FP_TO_UINT:
1054 case ISD::SINT_TO_FP:
1055 case ISD::UINT_TO_FP:
1056 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1057 // The cost of converting from or to a mask vector differs from the other
1058 // cases, so we cannot use PowDiff to calculate it.
1059 // For mask vector to fp, we should use the following instructions:
1060 // vmv.v.i v8, 0
1061 // vmerge.vim v8, v8, -1, v0
1062 // vfcvt.f.x.v v8, v8
1063
1064 // And for fp vector to mask, we use:
1065 // vfncvt.rtz.x.f.w v9, v8
1066 // vand.vi v8, v9, 1
1067 // vmsne.vi v0, v8, 0
1068 return 3;
1069 }
1070 if (std::abs(PowDiff) <= 1)
1071 return 1;
1072 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1073 // so it only needs two conversions.
1074 if (Src->isIntOrIntVectorTy())
1075 return 2;
1076 // Counts of narrow/widen instructions.
1077 return std::abs(PowDiff);
1078 }
1079 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1080}
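// Illustrative sketch of the extend modeling above: zext <4 x i8> to
// <4 x i32> has PowDiff == 2, so it is costed as a single vzext.vf4 on the
// legalized destination type; sext <4 x i8> to <4 x i64> (PowDiff == 3) maps
// to vsext.vf8 instead.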
1081
1082unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1083 if (isa<ScalableVectorType>(Ty)) {
1084 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1085 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1086 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1087 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1088 }
1089 return cast<FixedVectorType>(Ty)->getNumElements();
1090}
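// Illustrative sketch: for <vscale x 4 x i32> on a subtarget whose
// vscale-for-tuning is 2 (VLEN tuned to 128), the estimate is roughly the
// minimum element count scaled by vscale, i.e. an estimated VL of 8; for a
// fixed <8 x i16> it is simply 8.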
1091
1094 FastMathFlags FMF,
1096 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1097 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1098
1099 // Skip if scalar size of Ty is bigger than ELEN.
1100 if (Ty->getScalarSizeInBits() > ST->getELen())
1101 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1102
1103 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1104 if (Ty->getElementType()->isIntegerTy(1)) {
1105 // SelectionDAGBuilder does the following transforms:
1106 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1107 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1108 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1109 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1110 else
1111 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1112 }
1113
1114 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1116 InstructionCost ExtraCost = 0;
1117 switch (IID) {
1118 case Intrinsic::maximum:
1119 if (FMF.noNaNs()) {
1120 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1121 } else {
1122 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1123 RISCV::VFMV_F_S};
1124 // Cost of materializing the canonical NaN + branch
1125 // lui a0, 523264
1126 // fmv.w.x fa0, a0
1127 Type *DstTy = Ty->getScalarType();
1128 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1129 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1130 ExtraCost = 1 +
1131 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1133 getCFInstrCost(Instruction::Br, CostKind);
1134 }
1135 break;
1136
1137 case Intrinsic::minimum:
1138 if (FMF.noNaNs()) {
1139 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1140 } else {
1141 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1142 RISCV::VFMV_F_S};
1143 // Cost of materializing the canonical NaN + branch
1144 // lui a0, 523264
1145 // fmv.w.x fa0, a0
1146 Type *DstTy = Ty->getScalarType();
1147 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1148 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1149 ExtraCost = 1 +
1150 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1152 getCFInstrCost(Instruction::Br, CostKind);
1153 }
1154 break;
1155 }
1156 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1157 }
1158
1159 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1160 unsigned SplitOp;
1162 switch (IID) {
1163 default:
1164 llvm_unreachable("Unsupported intrinsic");
1165 case Intrinsic::smax:
1166 SplitOp = RISCV::VMAX_VV;
1167 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1168 break;
1169 case Intrinsic::smin:
1170 SplitOp = RISCV::VMIN_VV;
1171 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1172 break;
1173 case Intrinsic::umax:
1174 SplitOp = RISCV::VMAXU_VV;
1175 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1176 break;
1177 case Intrinsic::umin:
1178 SplitOp = RISCV::VMINU_VV;
1179 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1180 break;
1181 case Intrinsic::maxnum:
1182 SplitOp = RISCV::VFMAX_VV;
1183 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1184 break;
1185 case Intrinsic::minnum:
1186 SplitOp = RISCV::VFMIN_VV;
1187 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1188 break;
1189 }
1190 // Add a cost for data larger than LMUL8
1191 InstructionCost SplitCost =
1192 (LT.first > 1) ? (LT.first - 1) *
1193 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1194 : 0;
1195 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1196}
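// Illustrative sketch of the i1 special case above: an umax reduction of
// <8 x i1> is treated as vector_reduce_or, which the arithmetic-reduction
// path below models as a vcpop.m followed by a scalar compare (snez).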
1197
1200 std::optional<FastMathFlags> FMF,
1202 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1203 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1204
1205 // Skip if scalar size of Ty is bigger than ELEN.
1206 if (Ty->getScalarSizeInBits() > ST->getELen())
1207 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1208
1209 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1210 assert(ISD && "Invalid opcode");
1211
1212 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1213 ISD != ISD::FADD)
1214 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1215
1216 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1218 Type *ElementTy = Ty->getElementType();
1219 if (ElementTy->isIntegerTy(1)) {
1220 if (ISD == ISD::AND) {
1221 // Example sequences:
1222 // vsetvli a0, zero, e8, mf8, ta, ma
1223 // vmnot.m v8, v0
1224 // vcpop.m a0, v8
1225 // seqz a0, a0
1226 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1227 return (LT.first - 1) +
1228 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1229 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1231 } else {
1232 // Example sequences:
1233 // vsetvli a0, zero, e8, mf8, ta, ma
1234 // vcpop.m a0, v0
1235 // snez a0, a0
1236 Opcodes = {RISCV::VCPOP_M};
1237 return (LT.first - 1) +
1238 getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
1239 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1241 }
1242 }
1243
1244 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1246 Opcodes.push_back(RISCV::VFMV_S_F);
1247 for (unsigned i = 0; i < LT.first.getValue(); i++)
1248 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1249 Opcodes.push_back(RISCV::VFMV_F_S);
1250 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1251 }
1252 unsigned SplitOp;
1253 switch (ISD) {
1254 case ISD::ADD:
1255 SplitOp = RISCV::VADD_VV;
1256 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1257 break;
1258 case ISD::OR:
1259 SplitOp = RISCV::VOR_VV;
1260 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1261 break;
1262 case ISD::XOR:
1263 SplitOp = RISCV::VXOR_VV;
1264 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1265 break;
1266 case ISD::AND:
1267 SplitOp = RISCV::VAND_VV;
1268 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1269 break;
1270 case ISD::FADD:
1271 SplitOp = RISCV::VFADD_VV;
1272 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1273 break;
1274 }
1275 // Add a cost for data larger than LMUL8
1276 InstructionCost SplitCost =
1277 (LT.first > 1) ? (LT.first - 1) *
1278 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1279 : 0;
1280 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1281}
1282
1284 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1286 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1287 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1288 FMF, CostKind);
1289
1290 // Skip if scalar size of ResTy is bigger than ELEN.
1291 if (ResTy->getScalarSizeInBits() > ST->getELen())
1292 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1293 FMF, CostKind);
1294
1295 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1296 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1297 FMF, CostKind);
1298
1299 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1300
1301 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1302 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1303 FMF, CostKind);
1304
1305 return (LT.first - 1) +
1306 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1307}
1308
1310 TTI::OperandValueInfo OpInfo,
1312 assert(OpInfo.isConstant() && "non constant operand?");
1313 if (!isa<VectorType>(Ty))
1314 // FIXME: We need to account for immediate materialization here, but doing
1315 // a decent job requires more knowledge about the immediate than we
1316 // currently have here.
1317 return 0;
1318
1319 if (OpInfo.isUniform())
1320 // vmv.x.i, vmv.v.x, or vfmv.v.f
1321 // We ignore the cost of the scalar constant materialization to be consistent
1322 // with how we treat scalar constants themselves just above.
1323 return 1;
1324
1325 return getConstantPoolLoadCost(Ty, CostKind);
1326}
1327
1328
1330 MaybeAlign Alignment,
1331 unsigned AddressSpace,
1333 TTI::OperandValueInfo OpInfo,
1334 const Instruction *I) {
1335 EVT VT = TLI->getValueType(DL, Src, true);
1336 // Type legalization can't handle structs
1337 if (VT == MVT::Other)
1338 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1339 CostKind, OpInfo, I);
1340
1342 if (Opcode == Instruction::Store && OpInfo.isConstant())
1343 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1344 InstructionCost BaseCost =
1345 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1346 CostKind, OpInfo, I);
1347 // Assume memory op costs scale with the number of vector registers
1348 // possibly accessed by the instruction. Note that BasicTTI already
1349 // handles the LT.first term for us.
1350 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1351 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1352 BaseCost *= TLI->getLMULCost(LT.second);
1353 return Cost + BaseCost;
1354
1355}
1356
1358 Type *CondTy,
1359 CmpInst::Predicate VecPred,
1361 const Instruction *I) {
1363 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1364 I);
1365
1366 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1367 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1368 I);
1369
1370 // Skip if scalar size of ValTy is bigger than ELEN.
1371 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1372 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1373 I);
1374
1375 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1376 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1377 if (CondTy->isVectorTy()) {
1378 if (ValTy->getScalarSizeInBits() == 1) {
1379 // vmandn.mm v8, v8, v9
1380 // vmand.mm v9, v0, v9
1381 // vmor.mm v0, v9, v8
1382 return LT.first *
1383 getRISCVInstructionCost(
1384 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1385 LT.second, CostKind);
1386 }
1387 // vselect and max/min are supported natively.
1388 return LT.first *
1389 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
1390 }
1391
1392 if (ValTy->getScalarSizeInBits() == 1) {
1393 // vmv.v.x v9, a0
1394 // vmsne.vi v9, v9, 0
1395 // vmandn.mm v8, v8, v9
1396 // vmand.mm v9, v0, v9
1397 // vmor.mm v0, v9, v8
1398 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1399 return LT.first *
1400 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1401 InterimVT, CostKind) +
1402 LT.first * getRISCVInstructionCost(
1403 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1404 LT.second, CostKind);
1405 }
1406
1407 // vmv.v.x v10, a0
1408 // vmsne.vi v0, v10, 0
1409 // vmerge.vvm v8, v9, v8, v0
1410 return LT.first * getRISCVInstructionCost(
1411 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1412 LT.second, CostKind);
1413 }
1414
1415 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1416 CmpInst::isIntPredicate(VecPred)) {
1417 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1418 // provided they incur the same cost across all implementations
1419 return LT.first *
1420 getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
1421 }
1422
1423 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1424 CmpInst::isFPPredicate(VecPred)) {
1425
1426 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1427 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1428 return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1429
1430 // If we do not support the input floating point vector type, use the base
1431 // implementation, which computes the cost as:
1432 // ScalarizeCost + Num * Cost for fixed vectors,
1433 // InvalidCost for scalable vectors.
1434 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1435 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1436 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1437 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1438 I);
1439
1440 // Assuming vector fp compare and mask instructions are all the same cost
1441 // until a need arises to differentiate them.
1442 switch (VecPred) {
1443 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1444 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1445 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1446 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1447 return LT.first * getRISCVInstructionCost(
1448 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1449 LT.second, CostKind);
1450
1451 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1452 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1453 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1454 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1455 return LT.first *
1456 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1457 LT.second, CostKind);
1458
1459 case CmpInst::FCMP_OEQ: // vmfeq.vv
1460 case CmpInst::FCMP_OGT: // vmflt.vv
1461 case CmpInst::FCMP_OGE: // vmfle.vv
1462 case CmpInst::FCMP_OLT: // vmflt.vv
1463 case CmpInst::FCMP_OLE: // vmfle.vv
1464 case CmpInst::FCMP_UNE: // vmfne.vv
1465 return LT.first *
1466 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1467 default:
1468 break;
1469 }
1470 }
1471
1472 // TODO: Add cost for scalar type.
1473
1474 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1475}
1476
1479 const Instruction *I) {
1480 if (CostKind != TTI::TCK_RecipThroughput)
1481 return Opcode == Instruction::PHI ? 0 : 1;
1482 // Branches are assumed to be predicted.
1483 return 0;
1484}
1485
1488 unsigned Index, Value *Op0,
1489 Value *Op1) {
1490 assert(Val->isVectorTy() && "This must be a vector type");
1491
1492 if (Opcode != Instruction::ExtractElement &&
1493 Opcode != Instruction::InsertElement)
1494 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1495
1496 // Legalize the type.
1497 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1498
1499 // This type is legalized to a scalar type.
1500 if (!LT.second.isVector()) {
1501 auto *FixedVecTy = cast<FixedVectorType>(Val);
1502 // If Index is a known constant, cost is zero.
1503 if (Index != -1U)
1504 return 0;
1505 // Extract/InsertElement with non-constant index is very costly when
1506 // scalarized; estimate cost of loads/stores sequence via the stack:
1507 // ExtractElement cost: store vector to stack, load scalar;
1508 // InsertElement cost: store vector to stack, store scalar, load vector.
1509 Type *ElemTy = FixedVecTy->getElementType();
1510 auto NumElems = FixedVecTy->getNumElements();
1511 auto Align = DL.getPrefTypeAlign(ElemTy);
1512 InstructionCost LoadCost =
1513 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1514 InstructionCost StoreCost =
1515 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1516 return Opcode == Instruction::ExtractElement
1517 ? StoreCost * NumElems + LoadCost
1518 : (StoreCost + LoadCost) * NumElems + StoreCost;
1519 }
1520
1521 // For unsupported scalable vector.
1522 if (LT.second.isScalableVector() && !LT.first.isValid())
1523 return LT.first;
1524
1525 if (!isTypeLegal(Val))
1526 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1527
1528 // Mask vector extract/insert is expanded via e8.
1529 if (Val->getScalarSizeInBits() == 1) {
1530 VectorType *WideTy =
1532 cast<VectorType>(Val)->getElementCount());
1533 if (Opcode == Instruction::ExtractElement) {
1534 InstructionCost ExtendCost
1535 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1537 InstructionCost ExtractCost
1538 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1539 return ExtendCost + ExtractCost;
1540 }
1541 InstructionCost ExtendCost
1542 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1544 InstructionCost InsertCost
1545 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1546 InstructionCost TruncCost
1547 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1549 return ExtendCost + InsertCost + TruncCost;
1550 }
1551
1552
1553 // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
1554 // vector, and vslideup + vmv.s.x to insert an element into a vector.
1555 unsigned BaseCost = 1;
1556 // For insertelement we also need to compute index + 1 as the input of vslideup.
1557 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1558
1559 if (Index != -1U) {
1560 // The type may be split. For fixed-width vectors we can normalize the
1561 // index to the new type.
1562 if (LT.second.isFixedLengthVector()) {
1563 unsigned Width = LT.second.getVectorNumElements();
1564 Index = Index % Width;
1565 }
1566
1567 // We could extract/insert the first element without vslidedown/vslideup.
1568 if (Index == 0)
1569 SlideCost = 0;
1570 else if (Opcode == Instruction::InsertElement)
1571 SlideCost = 1; // With a constant index, we do not need to use addi.
1572 }
1573
1574 // Extracting an i64 on a target with XLEN=32 needs more instructions.
1575 if (Val->getScalarType()->isIntegerTy() &&
1576 ST->getXLen() < Val->getScalarSizeInBits()) {
1577 // For extractelement, we need the following instructions:
1578 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1579 // vslidedown.vx v8, v8, a0
1580 // vmv.x.s a0, v8
1581 // li a1, 32
1582 // vsrl.vx v8, v8, a1
1583 // vmv.x.s a1, v8
1584
1585 // For insertelement, we need the following instructions:
1586 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1587 // vmv.v.i v12, 0
1588 // vslide1up.vx v16, v12, a1
1589 // vslide1up.vx v12, v16, a0
1590 // addi a0, a2, 1
1591 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1592 // vslideup.vx v8, v12, a2
1593
1594 // TODO: should we count these special vsetvlis?
1595 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1596 }
1597 return BaseCost + SlideCost;
1598}
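// Illustrative sketch of the model above for extractelement <8 x i32> on
// RV64: index 0 costs 1 (a single vmv.x.s), a non-zero constant index costs
// 2 (vslidedown.vi + vmv.x.s), and inserting at a non-zero constant index
// also costs 2 (vslideup.vi + vmv.s.x); the extra addi only shows up for
// variable-index inserts.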
1599
1601 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1603 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1604
1605 // TODO: Handle more cost kinds.
1606 if (CostKind != TTI::TCK_RecipThroughput)
1607 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1608 Args, CxtI);
1609
1610 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1611 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1612 Args, CxtI);
1613
1614 // Skip if scalar size of Ty is bigger than ELEN.
1615 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1616 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1617 Args, CxtI);
1618
1619 // Legalize the type.
1620 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1621
1622 // TODO: Handle scalar type.
1623 if (!LT.second.isVector())
1624 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
1625 Args, CxtI);
1626
1627
1628 auto getConstantMatCost =
1629 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1630 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1631 // Two sub-cases:
1632 // * Has a 5 bit immediate operand which can be splatted.
1633 // * Has a larger immediate which must be materialized in scalar register
1634 // We return 0 for both as we currently ignore the cost of materializing
1635 // scalar constants in GPRs.
1636 return 0;
1637
1638 return getConstantPoolLoadCost(Ty, CostKind);
1639 };
1640
1641 // Add the cost of materializing any constant vectors required.
1642 InstructionCost ConstantMatCost = 0;
1643 if (Op1Info.isConstant())
1644 ConstantMatCost += getConstantMatCost(0, Op1Info);
1645 if (Op2Info.isConstant())
1646 ConstantMatCost += getConstantMatCost(1, Op2Info);
1647
1648 unsigned Op;
1649 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1650 case ISD::ADD:
1651 case ISD::SUB:
1652 Op = RISCV::VADD_VV;
1653 break;
1654 case ISD::SHL:
1655 case ISD::SRL:
1656 case ISD::SRA:
1657 Op = RISCV::VSLL_VV;
1658 break;
1659 case ISD::AND:
1660 case ISD::OR:
1661 case ISD::XOR:
1662 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
1663 break;
1664 case ISD::MUL:
1665 case ISD::MULHS:
1666 case ISD::MULHU:
1667 Op = RISCV::VMUL_VV;
1668 break;
1669 case ISD::SDIV:
1670 case ISD::UDIV:
1671 Op = RISCV::VDIV_VV;
1672 break;
1673 case ISD::SREM:
1674 case ISD::UREM:
1675 Op = RISCV::VREM_VV;
1676 break;
1677 case ISD::FADD:
1678 case ISD::FSUB:
1679 // TODO: Address FP16 with VFHMIN
1680 Op = RISCV::VFADD_VV;
1681 break;
1682 case ISD::FMUL:
1683 // TODO: Address FP16 with VFHMIN
1684 Op = RISCV::VFMUL_VV;
1685 break;
1686 case ISD::FDIV:
1687 Op = RISCV::VFDIV_VV;
1688 break;
1689 case ISD::FNEG:
1690 Op = RISCV::VFSGNJN_VV;
1691 break;
1692 default:
1693 // Assuming all other instructions have the same cost until a need arises to
1694 // differentiate them.
1695 return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1696 Op1Info, Op2Info,
1697 Args, CxtI);
1698 }
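// The return below is then: the cost of materializing constant operands, plus
// the number of legalized parts (LT.first) times the cost of the single RVV
// instruction chosen above. E.g. an integer add on a type that legalizes into
// two registers is modeled as two vadd.vv operations.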
1699 return ConstantMatCost +
1700 LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1701}
1702
1703// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1704InstructionCost RISCVTTIImpl::getPointersChainCost(
1705 ArrayRef<const Value *> Ptrs, const Value *Base,
1706 const TTI::PointersChainInfo &Info, Type *AccessTy,
1707 TTI::TargetCostKind CostKind) {
1708 InstructionCost Cost = TTI::TCC_Free;
1709 // In the basic model we take into account GEP instructions only
1710 // (although an alloca instruction, a plain value, constants and/or
1711 // constant expressions, PHIs, bitcasts - anything allowed to be used as a
1712 // pointer - can appear here). Typically, if Base is not a GEP instruction
1713 // and all the pointers are relative to the same base address, the rest are
1714 // either GEP instructions, PHIs, bitcasts or constants. When we have the
1715 // same base, we just calculate the cost of each non-Base GEP as an ADD
1716 // operation if any of its indices is non-constant.
1717 // If there are no known dependencies between the pointers, the cost is
1718 // calculated as the sum of the costs of the GEP instructions.
1719 for (auto [I, V] : enumerate(Ptrs)) {
1720 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1721 if (!GEP)
1722 continue;
1723 if (Info.isSameBase() && V != Base) {
1724 if (GEP->hasAllConstantIndices())
1725 continue;
1726 // If the chain is unit-stride and BaseReg + stride*i is a legal
1727 // addressing mode, then presume the base GEP is sitting around in a
1728 // register somewhere and check if we can fold the offset relative to
1729 // it.
1730 unsigned Stride = DL.getTypeStoreSize(AccessTy);
1731 if (Info.isUnitStride() &&
1732 isLegalAddressingMode(AccessTy,
1733 /* BaseGV */ nullptr,
1734 /* BaseOffset */ Stride * I,
1735 /* HasBaseReg */ true,
1736 /* Scale */ 0,
1737 GEP->getType()->getPointerAddressSpace()))
1738 continue;
1739 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
1740 {TTI::OK_AnyValue, TTI::OP_None},
1741 {TTI::OK_AnyValue, TTI::OP_None},
1742 std::nullopt);
1743 } else {
1744 SmallVector<const Value *> Indices(GEP->indices());
1745 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
1746 Indices, AccessTy, CostKind);
1747 }
1748 }
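// In short: for a same-base, unit-stride chain, each non-Base GEP whose byte
// offset (Stride * i) can be folded into a legal addressing mode is treated
// as free; otherwise it is charged as a single Add. Pointers that do not
// share the base are charged as full GEPs instead.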
1749 return Cost;
1750}
1751
1752void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1753 TTI::UnrollingPreferences &UP,
1754 OptimizationRemarkEmitter *ORE) {
1755 // TODO: More tuning on benchmarks and metrics is needed; changes here
1756 // would apply to all of the settings below.
1757
1758
1759 if (ST->enableDefaultUnroll())
1760 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1761
1762 // Enable upper-bound unrolling universally; it is not dependent on the
1763 // conditions below.
1764 UP.UpperBound = true;
1765
1766 // Disable loop unrolling for Oz and Os.
1767 UP.OptSizeThreshold = 0;
1768 UP.PartialOptSizeThreshold = 0;
1769 if (L->getHeader()->getParent()->hasOptSize())
1770 return;
1771
1772 SmallVector<BasicBlock *, 4> ExitingBlocks;
1773 L->getExitingBlocks(ExitingBlocks);
1774 LLVM_DEBUG(dbgs() << "Loop has:\n"
1775 << "Blocks: " << L->getNumBlocks() << "\n"
1776 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1777
1778 // Allow at most one exit in addition to the latch. This acts as an early
1779 // exit, mirroring the profitability calculation of the runtime unroller.
1780 if (ExitingBlocks.size() > 2)
1781 return;
1782
1783 // Limit the CFG of the loop body for targets with a branch predictor.
1784 // Allowing 4 blocks permits if-then-else diamonds in the body.
1785 if (L->getNumBlocks() > 4)
1786 return;
1787
1788 // Don't unroll vectorized loops, including the remainder loop
1789 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1790 return;
1791
1792 // Scan the loop: don't unroll loops with calls as this could prevent
1793 // inlining.
1794 InstructionCost Cost = 0;
1795 for (auto *BB : L->getBlocks()) {
1796 for (auto &I : *BB) {
1797 // Initial setting - Don't unroll loops containing vectorized
1798 // instructions.
1799 if (I.getType()->isVectorTy())
1800 return;
1801
1802 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1803 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1804 if (!isLoweredToCall(F))
1805 continue;
1806 }
1807 return;
1808 }
1809
1810 SmallVector<const Value *> Operands(I.operand_values());
1811 Cost += getInstructionCost(&I, Operands,
1812 TargetTransformInfo::TCK_SizeAndLatency);
1813 }
1814 }
1815
1816 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1817
1818 UP.Partial = true;
1819 UP.Runtime = true;
1820 UP.UnrollRemainder = true;
1821 UP.UnrollAndJam = true;
1823
1824 // Force-unrolling small loops can be very useful because of the
1825 // branch-taken cost of the backedge.
1826 if (Cost < 12)
1827 UP.Force = true;
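// As an example of the intent here: a small two-block loop whose estimated
// per-iteration cost sums to, say, 10 stays under the threshold of 12 and is
// force-unrolled, avoiding the taken-branch cost of the backedge on every
// iteration.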
1828}
1829
1830void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1831 TTI::PeelingPreferences &PP) {
1832 BaseT::getPeelingPreferences(L, SE, PP);
1833}
1834
1835unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1836 TypeSize Size = DL.getTypeSizeInBits(Ty);
1837 if (Ty->isVectorTy()) {
1838 if (Size.isScalable() && ST->hasVInstructions())
1839 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1840
1841 if (ST->useRVVForFixedLengthVectors())
1842 return divideCeil(Size, ST->getRealMinVLen());
1843 }
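// Rough examples: a <vscale x 4 x i64> value has a known minimum size of 256
// bits, so with RVVBitsPerBlock (64) it occupies 256 / 64 = 4 vector
// registers; a hypothetical fixed 512-bit vector on a subtarget whose minimum
// VLEN is 128 would likewise be counted as ceil(512 / 128) = 4 registers.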
1844
1845 return BaseT::getRegUsageForType(Ty);
1846}
1847
1848unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1849 if (SLPMaxVF.getNumOccurrences())
1850 return SLPMaxVF;
1851
1852 // Return how many elements can fit in getRegisterBitWidth. This is the
1853 // same routine as used in the LoopVectorizer. We should probably be
1854 // accounting for whether we actually have instructions with the right
1855 // lane type, but we don't have enough information to do that without
1856 // some additional plumbing which hasn't been justified yet.
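// E.g., assuming the fixed-width register query below reports 128 bits and
// ElemWidth is 32, the SLP vectorizer is offered a maximum VF of 128 / 32 = 4
// (unless the riscv-v-slp-max-vf option checked above overrides the result).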
1857 TypeSize RegWidth =
1858 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1859 // If no vector registers, or absurd element widths, disable
1860 // vectorization by returning 1.
1861 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1862}
1863
1864bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1865 const TargetTransformInfo::LSRCost &C2) {
1866 // RISC-V specific: the number of instructions has first priority.
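// Note that the std::tie comparison below is lexicographic: a solution that
// needs fewer instructions always wins, the register count only breaks ties
// in the instruction count, AddRec cost breaks ties in registers, and so on
// down the list.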
1867 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
1868 C1.NumIVMuls, C1.NumBaseAdds,
1869 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
1870 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
1871 C2.NumIVMuls, C2.NumBaseAdds,
1872 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1873}
1874
1875bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1876 auto *VTy = dyn_cast<VectorType>(DataTy);
1877 if (!VTy || VTy->isScalableTy())
1878 return false;
1879
1880 if (!isLegalMaskedLoadStore(DataTy, Alignment))
1881 return false;
1882 return true;
1883}
1884
1885bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1886 const Function *Callee) const {
1887 const TargetMachine &TM = getTLI()->getTargetMachine();
1888
1889 const FeatureBitset &CallerBits =
1890 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1891 const FeatureBitset &CalleeBits =
1892 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1893
1894 // Inline a callee if its target-features are a subset of the caller's
1895 // target-features.
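// For instance, a callee built with just +m,+a can be inlined into a caller
// that enables +m,+a,+c,+v, but a callee requiring +v cannot be inlined into
// a caller that lacks it, because the subset check below would fail.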
1896 return (CallerBits & CalleeBits) == CalleeBits;
1897}